From 03b22e561e6e0306aab78f07dcc1482c27b34b76 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 14 Jul 2022 15:39:07 +0300 Subject: [PATCH 001/254] [isp] Add ISP command line option. --- src/cfg.c | 5 +++++ src/cli.c | 3 +++ src/uvg266.h | 2 ++ 3 files changed, 10 insertions(+) diff --git a/src/cfg.c b/src/cfg.c index cafadcb2..f5763be8 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -207,6 +207,8 @@ int uvg_config_init(uvg_config *cfg) cfg->lfnst = false; + cfg->isp = false; + parse_qp_map(cfg, 0); cfg->jccr = 0; @@ -1454,6 +1456,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("lfnst") { cfg->lfnst = atobool(value); } + else if OPT("isp") { + cfg->isp = atobool(value); + } else if OPT("jccr") { cfg->jccr = (bool)atobool(value); } diff --git a/src/cli.c b/src/cli.c index fa6ee6df..073fd12e 100644 --- a/src/cli.c +++ b/src/cli.c @@ -178,6 +178,8 @@ static const struct option long_options[] = { { "no-mip", no_argument, NULL, 0 }, { "lfnst", no_argument, NULL, 0 }, { "no-lfnst", no_argument, NULL, 0 }, + { "isp", no_argument, NULL, 0 }, + { "no-isp", no_argument, NULL, 0 }, { "jccr", no_argument, NULL, 0 }, { "no-jccr", no_argument, NULL, 0 }, { "amvr", no_argument, NULL, 0 }, @@ -671,6 +673,7 @@ void print_help(void) " --(no-)mip : Enable matrix weighted intra prediction.\n" " --(no-)lfnst : Enable low frequency non-separable transform.\n" " [disabled]\n" + " --(no-)isp : Enable intra sub partitions. 
[disabled]\n" " --mts : Multiple Transform Selection [off].\n" " (Currently only implemented for intra\n" " and has effect only when rd >= 2)\n" diff --git a/src/uvg266.h b/src/uvg266.h index 3bec7756..b5103249 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -526,6 +526,8 @@ typedef struct uvg_config /** \brief enable low frequency non-separable transform */ int8_t lfnst; + /** \brief enable intra sub partitions*/ + int8_t isp; int8_t jccr; From 9406c5c31d9c9a2b740a5b5e63ef7247fad8d206 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 19 Jul 2022 16:52:07 +0300 Subject: [PATCH 002/254] [isp] Modify generic intra pred functions to handle non-square blocks. --- src/intra.c | 58 +++++--- src/strategies/avx2/intra-avx2.c | 28 ++-- src/strategies/generic/intra-generic.c | 179 +++++++++++++++++-------- src/strategies/strategies-intra.h | 6 +- 4 files changed, 179 insertions(+), 92 deletions(-) diff --git a/src/intra.c b/src/intra.c index 7e742d46..ac3fc220 100644 --- a/src/intra.c +++ b/src/intra.c @@ -197,6 +197,7 @@ int8_t uvg_intra_get_dir_luma_predictor( static void intra_filter_reference( int_fast8_t log2_width, + int_fast8_t log2_height, uvg_intra_references *refs) { if (refs->filtered_initialized) { @@ -206,6 +207,7 @@ static void intra_filter_reference( } const int_fast8_t ref_width = 2 * (1 << log2_width) + 1; + const int_fast8_t ref_height = 2 * (1 << log2_height) + 1; uvg_intra_ref *ref = &refs->ref; uvg_intra_ref *filtered_ref = &refs->filtered_ref; @@ -213,14 +215,13 @@ static void intra_filter_reference( filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) >> 2; filtered_ref->top[0] = filtered_ref->left[0]; - // TODO: use block height here instead of ref_width // Top to bottom - for (int_fast8_t y = 1; y < ref_width - 1; ++y) { + for (int_fast8_t y = 1; y < ref_height - 1; ++y) { uvg_pixel *p = &ref->left[y]; filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2; } // Bottom left (not filtered) - filtered_ref->left[ref_width - 
1] = ref->left[ref_width - 1]; + filtered_ref->left[ref_height - 1] = ref->left[ref_height - 1]; // Left to right for (int_fast8_t x = 1; x < ref_width - 1; ++x) { @@ -234,36 +235,46 @@ static void intra_filter_reference( /** * \brief Generate dc prediction. -* \param log2_width Log2 of width, range 2..5. +* \param cu_loc CU location and size data. +* \param color Color channel. * \param ref_top Pointer to -1 index of above reference, length=width*2+1. * \param ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. */ static void intra_pred_dc( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const out_block, const uint8_t multi_ref_idx) { - int_fast8_t width = 1 << log2_width; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; int_fast16_t sum = 0; - for (int_fast8_t i = 0; i < width; ++i) { - sum += ref_top[i + 1 + multi_ref_idx]; - sum += ref_left[i + 1 + multi_ref_idx]; + // Only one loop is done for non-square blocks. + // In case of non-square blocks, only the longer reference is summed. + if (width >= height) { + for (int_fast8_t i = 0; i < width; ++i) { + sum += ref_top[i + 1 + multi_ref_idx]; + } + } + if (width <= height) { + for (int_fast8_t j = 0; j < height; ++j) { + sum += ref_left[j + 1 + multi_ref_idx]; + } } // JVET_K0122 - // TODO: take non-square blocks into account - const int denom = width << 1; + const int denom = width == height ? 
width << 1 : MAX(width, height); const int divShift = uvg_math_floor_log2(denom); const int divOffset = denom >> 1; const uvg_pixel dc_val = (sum + divOffset) >> divShift; //const uvg_pixel dc_val = (sum + width) >> (log2_width + 1); - const int_fast16_t block_size = 1 << (log2_width * 2); + const int_fast16_t block_size = width * height; for (int_fast16_t i = 0; i < block_size; ++i) { out_block[i] = dc_val; @@ -901,31 +912,34 @@ static void mip_predict( static void intra_predict_regular( const encoder_state_t* const state, uvg_intra_references *refs, - int_fast8_t log2_width, + const cu_loc_t* const cu_loc, int_fast8_t mode, color_t color, uvg_pixel *dst, const uint8_t multi_ref_idx) { - const int_fast8_t width = 1 << log2_width; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; const uvg_config *cfg = &state->encoder_control->cfg; // MRL only for luma uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || width == 4 || multi_ref_index) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. - if (width * width > 32) { + if (width * height > 32) { used_ref = &refs->filtered_ref; } } else { // Angular modes use smoothed reference pixels, unless the mode is close // to being either vertical or horizontal. 
static const int uvg_intra_hor_ver_dist_thres[8] = {24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_width) >> 1]; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { @@ -939,15 +953,15 @@ static void intra_predict_regular( } if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) { - intra_filter_reference(log2_width, refs); + intra_filter_reference(log2_width, log2_height, refs); } if (mode == 0) { - uvg_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst); + uvg_intra_pred_planar(cu_loc, color, used_ref->top, used_ref->left, dst); } else if (mode == 1) { - intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst, multi_ref_index); + intra_pred_dc(cu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(log2_width, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index); + uvg_angular_pred(cu_loc, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index); } // pdpc @@ -1407,7 +1421,7 @@ void uvg_intra_predict( mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { - intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + intra_predict_regular(state, refs, cu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); } } else { diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 53282e87..21c5c66f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -42,10 +42,9 @@ #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" - /** * \brief Generate angular predictions. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU locationand size data. 
* \param intra_mode Angular mode in range 2..34. * \param channel_type Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. @@ -54,7 +53,7 @@ * \param multi_ref_idx Reference line index for use with MRL. */ static void uvg_angular_pred_avx2( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, @@ -62,8 +61,12 @@ static void uvg_angular_pred_avx2( uvg_pixel *const dst, const uint8_t multi_ref_idx) { - - assert(log2_width >= 2 && log2_width <= 5); + const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); assert(intra_mode >= 2 && intra_mode <= 66); // TODO: implement handling of MRL @@ -142,7 +145,6 @@ static void uvg_angular_pred_avx2( //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE:IDX] = { 0 }; uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - const int_fast32_t width = 1 << log2_width; int32_t pred_mode = intra_mode; // ToDo: handle WAIP @@ -497,20 +499,26 @@ static void uvg_angular_pred_avx2( /** * \brief Generate planar prediction. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. 
*/ static void uvg_intra_pred_planar_avx2( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uint8_t *const ref_top, const uint8_t *const ref_left, uint8_t *const dst) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); - const int_fast8_t width = 1 << log2_width; const uint8_t top_right = ref_top[width + 1]; const uint8_t bottom_left = ref_left[width + 1]; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 35494b99..179763a7 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -34,6 +34,7 @@ #include +#include "cu.h" #include "intra.h" #include "uvg266.h" #include "strategyselector.h" @@ -42,15 +43,16 @@ /** * \brief Generate angular predictions. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. * \param intra_mode Angular mode in range 2..34. + * \param channel_type Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. - * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. + * \param in_ref_left Pointer to -1 index of left reference, length=height*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. 
*/ static void uvg_angular_pred_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, @@ -58,8 +60,12 @@ static void uvg_angular_pred_generic( uvg_pixel *const dst, const uint8_t multi_ref_idx) { + const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; - assert(log2_width >= 2 && log2_width <= 5); + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; @@ -107,9 +113,8 @@ static void uvg_angular_pred_generic( // TODO: check the correct size for these arrays when MRL is used //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - const int_fast32_t width = 1 << log2_width; + uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uint32_t pred_mode = intra_mode; // ToDo: handle WAIP @@ -124,7 +129,7 @@ static void uvg_angular_pred_generic( // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - // TODO: replace latter width with height + // ISP_TODO: replace latter width with height int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); // Pointer for the reference we are interpolating from. 
@@ -136,19 +141,32 @@ static void uvg_angular_pred_generic( // index 0 in block coordinates. if (sample_disp < 0) { - // TODO: for non-square blocks, separate loops for x and y is needed - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + // ISP_TODO: might be able to use memcpy instead of loops here, should be a bit faster. + /*if (vertical_mode) { + for (int i = 0; i <= width + 1 + multi_ref_index; i++) { + temp_main[width + i] = in_ref_above[i]; + } + for (int j = 0; j <= height + 1 + multi_ref_index; j++) { + temp_side[height + j] = in_ref_left[j]; + } + } else { + for (int i = 0; i <= width + 1 + multi_ref_index; i++) { + temp_side[width + i] = in_ref_above[i]; + } + for (int j = 0; j <= height + 1 + multi_ref_index; j++) { + temp_main[height + j] = in_ref_left[j]; + } + }*/ + memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); - // TODO: take into account non-square blocks - ref_main = temp_main + width; - ref_side = temp_side + width; + ref_main = vertical_mode ? temp_above + height : temp_left + width; + ref_side = vertical_mode ? temp_left + width : temp_above + height; // TODO: for non square blocks, need to check if width or height is used for reference extension - for (int i = -width; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; + int size_side = vertical_mode ? 
height : width; + for (int i = -size_side; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } //const uint32_t index_offset = width + 1; @@ -186,23 +204,27 @@ static void uvg_angular_pred_generic( else { // TODO: again, separate loop needed for non-square blocks - for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { + /*for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + }*/ + memcpy(&temp_above[0], &in_ref_above[0], ((width << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[0], &in_ref_left[0], ((height << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel)); + + ref_main = vertical_mode ? temp_above : temp_left; + ref_side = vertical_mode ? temp_left : temp_above; // TODO: this code block will need to change also when non-square blocks are used - // const int log2_ratio = 0; - const int s = 0; + const int log2_ratio = log2_width - log2_height; + const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); const int max_index = (multi_ref_index << s) + 2; - const int ref_length = width << 1; - const uvg_pixel val = temp_main[ref_length + multi_ref_index]; + const int ref_length = vertical_mode ? width << 1 : height << 1; + const uvg_pixel val = ref_main[ref_length + multi_ref_index]; for (int j = 1; j <= max_index; j++) { - temp_main[ref_length + multi_ref_index + j] = val; + ref_main[ref_length + multi_ref_index + j] = val; } - ref_main = temp_main; - ref_side = temp_side; + //// sample_disp >= 0 means we don't need to refer to negative indices, //// which means we can just use the references as is. //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; @@ -222,7 +244,7 @@ static void uvg_angular_pred_generic( if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. 
- for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); @@ -237,6 +259,7 @@ static void uvg_angular_pred_generic( int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { + // ISP_TODO: these are introduced in the beginning of this function or am I missing something? static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; @@ -291,7 +314,7 @@ static void uvg_angular_pred_generic( } } if(PDPC_filter) { - int inv_angle_sum = 256; + int inv_angle_sum = 256; for (int x = 0; x < MIN(3 << scale, width); x++) { inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; @@ -336,31 +359,54 @@ static void uvg_angular_pred_generic( } else { // Mode is horizontal or vertical, just copy the pixels. 
+ + // Do not apply PDPC if multi ref line index is other than 0 + // ISP_TODO: do not do PDPC if block is in BDPCM mode + bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0); - // TODO: update outer loop to use height instead of width - for (int_fast32_t y = 0; y < width; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; - } - // Do not apply PDPC if multi ref line index is other than 0 - if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_width - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; + if (do_pdpc) { + int scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) { + const int wL = 32 >> (2 * x >> scale); + const uvg_pixel val = dst[y * width + x]; + dst[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); } } + } else { + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + } } + // ISP_TODO: there is no reason to run these loops AND then check if PDPC is applied. 
Do the check first and then run either the normal or PDPC loops + + //for (int_fast32_t y = 0; y < height; ++y) { + // for (int_fast32_t x = 0; x < width; ++x) { + // dst[y * width + x] = ref_main[x + 1]; + // } + // // Do not apply PDPC if multi ref line index is other than 0 + // // ISP_TODO: do not do PDPC if block is in BDPCM mode + // if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { + // int scale = (log2_width + log2_height - 2) >> 2; + // const uvg_pixel top_left = ref_main[0]; + // const uvg_pixel left = ref_side[1 + y]; + // for (int i = 0; i < MIN(3 << scale, width); i++) { // ISP_TODO: is one loop enough for PDPC? + // const int wL = 32 >> (2 * i >> scale); + // const uvg_pixel val = dst[y * width + i]; + // dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + // } + // } + //} } // Flip the block if this is was a horizontal mode. if (!vertical_mode) { - for (int_fast32_t y = 0; y < width - 1; ++y) { + for (int_fast32_t y = 0; y < height - 1; ++y) { for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel); + SWAP(dst[y * width + x], dst[x * height + y], uvg_pixel); } } } @@ -369,23 +415,31 @@ static void uvg_angular_pred_generic( /** * \brief Generate planar prediction. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. */ static void uvg_intra_pred_planar_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst) { - // TODO: Add height - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + const int offset = 1 << (log2_width + log2_height); + const int final_shift = 1 + log2_width + log2_height; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); - const int_fast8_t width = 1 << log2_width; const uvg_pixel top_right = ref_top[width + 1]; - const uvg_pixel bottom_left = ref_left[width + 1]; + const uvg_pixel bottom_left = ref_left[height + 1]; #if 0 // Unoptimized version for reference. @@ -397,18 +451,27 @@ static void uvg_intra_pred_planar_generic( } } #else - int_fast16_t top[32]; + // TODO: get rid of magic numbers. Make a define for this + int_fast16_t top[64]; + int_fast16_t bottom[64]; + int_fast16_t left[64]; + int_fast16_t right[64]; for (int i = 0; i < width; ++i) { - top[i] = ref_top[i + 1] << log2_width; + bottom[i] = bottom_left - ref_top[i + 1]; + top[i] = ref_top[i + 1] << log2_height; } - for (int y = 0; y < width; ++y) { - int_fast16_t hor = (ref_left[y + 1] << log2_width) + width; + for (int j = 0; j < height; ++j) { + right[j] = top_right - ref_left[j + 1]; + left[j] = ref_left[j + 1] << log2_width; + } + + for (int y = 0; y < height; ++y) { + int_fast16_t hor = left[y]; for (int x = 0; x < width; ++x) { - hor += top_right - ref_left[y + 1]; - top[x] += bottom_left - ref_top[x + 1]; - dst[y * width + x] = (hor + top[x]) >> (log2_width + 1); - // + hor += right[y]; + top[x] += bottom[x]; + dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift; } } #endif diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 0f7228a0..4f14c376 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -38,13 +38,14 @@ * Interface for intra prediction functions. 
*/ +#include "cu.h" #include "global.h" // IWYU pragma: keep #include "intra.h" #include "uvg266.h" typedef void (angular_pred_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, @@ -53,7 +54,8 @@ typedef void (angular_pred_func)( const uint8_t multi_ref_idx); typedef void (intra_pred_planar_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst); From 96df3ffd642942c833fab653499cf6255e4f0222 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 21 Jul 2022 14:35:12 +0300 Subject: [PATCH 003/254] [isp] Change function calls to cu_loc_t. --- src/intra.c | 37 ++++++------ src/intra.h | 2 +- src/search_intra.c | 80 +++++++++++++------------- src/strategies/generic/intra-generic.c | 21 +++---- src/strategies/strategies-intra.h | 4 +- src/uvg266.h | 2 +- 6 files changed, 77 insertions(+), 69 deletions(-) diff --git a/src/intra.c b/src/intra.c index ac3fc220..0611d3f1 100644 --- a/src/intra.c +++ b/src/intra.c @@ -969,13 +969,13 @@ static void intra_predict_regular( bool pdpcCondition = (mode == 0 || mode == 1); // Planar and DC if (pdpcCondition && multi_ref_index == 0) // Cannot be used with MRL. { - uvg_pdpc_planar_dc(mode, width, log2_width, used_ref, dst); + uvg_pdpc_planar_dc(mode, cu_loc, color, used_ref, dst); } } void uvg_intra_build_reference_any( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -984,7 +984,12 @@ void uvg_intra_build_reference_any( const uint8_t multi_ref_idx, uvg_pixel *extra_ref_lines) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); refs->filtered_initialized = false; uvg_pixel *out_left_ref = &refs->ref.left[0]; @@ -992,8 +997,6 @@ void uvg_intra_build_reference_any( const uvg_pixel dc_val = 1 << (UVG_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-square blocks - const int_fast8_t width = 1 << log2_width; // Get multi ref index from CU under prediction or reconstrcution. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1184,7 +1187,7 @@ void uvg_intra_build_reference_any( } void uvg_intra_build_reference_inner( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1194,15 +1197,18 @@ void uvg_intra_build_reference_inner( const uint8_t multi_ref_idx, uvg_pixel* extra_ref_lines) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); refs->filtered_initialized = false; uvg_pixel * __restrict out_left_ref = &refs->ref.left[0]; uvg_pixel * __restrict out_top_ref = &refs->ref.top[0]; const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-sqaure blocks - const int_fast8_t width = 1 << log2_width; // Get multiRefIdx from CU under prediction. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? 
multi_ref_idx : 0; @@ -1366,7 +1372,7 @@ void uvg_intra_build_reference_inner( } void uvg_intra_build_reference( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1380,9 +1386,9 @@ void uvg_intra_build_reference( // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(log2_width, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_inner(cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines); } else { - uvg_intra_build_reference_any(log2_width, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_any(cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines); } } @@ -1513,16 +1519,15 @@ static void intra_recon_tb_leaf( frame->rec->stride, 1); } } - uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); - - uvg_pixel pred[32 * 32]; - cu_loc_t loc = { x, y, width, height, width, height, }; + uvg_intra_build_reference(&loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + + uvg_pixel pred[32 * 32]; uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu, tree_type); const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/intra.h b/src/intra.h index a2ffa230..6c7a648e 100644 --- a/src/intra.h +++ b/src/intra.h @@ -107,7 +107,7 @@ int8_t uvg_intra_get_dir_luma_predictor( * \param multi_ref_idx Multi reference line index for the prediction block. 
*/ void uvg_intra_build_reference( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, diff --git a/src/search_intra.c b/src/search_intra.c index 226c40c3..1ce4c8a5 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -682,22 +682,32 @@ static int search_intra_chroma_rough( int x_px, int y_px, int depth, - const uvg_pixel *orig_u, - const uvg_pixel *orig_v, - int16_t origstride, - uvg_intra_references *refs_u, - uvg_intra_references *refs_v, + const vector2d_t* const lcu_px, intra_search_data_t* chroma_data, lcu_t* lcu, int8_t luma_mode, enum uvg_tree_type tree_type) { assert(depth != 4 || (x_px & 4 && y_px & 4)); + const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); + const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; + const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; + const int width = 1 << log2_width_c; + const int height = width; // TODO: height for non-square blocks - const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; + + uvg_intra_references refs_u; + uvg_intra_build_reference(&loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); + + uvg_intra_references refs_v; + uvg_intra_build_reference(&loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); + + vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 }; + uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; //cost_pixel_nxn_func *const sad_func = uvg_pixels_get_sad_func(width); - cu_loc_t loc = { x_px & ~7, y_px & ~7, width, width, width, width }; uvg_pixel _pred[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); @@ -705,12 +715,12 @@ static int 
search_intra_chroma_rough( uvg_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig_u, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_u, orig_block, width, width, LCU_WIDTH_C, width); int modes_count = (state->encoder_control->cfg.cclm ? 8 : 5); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -725,11 +735,11 @@ static int search_intra_chroma_rough( } } - uvg_pixels_blit(orig_v, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_v, orig_block, width, width, LCU_WIDTH_C, width); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -1270,8 +1280,15 @@ static void get_rough_cost_for_2n_modes( #define PARALLEL_BLKS 2 assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes"); const int width = cu_loc->width; - cost_pixel_nxn_multi_func* satd_dual_func = uvg_pixels_get_satd_dual_func(width); - cost_pixel_nxn_multi_func* sad_dual_func = 
uvg_pixels_get_sad_dual_func(width); + const int height = cu_loc->height; + cost_pixel_nxn_multi_func* satd_dual_func; + cost_pixel_nxn_multi_func* sad_dual_func; + if (width == height) { + satd_dual_func = uvg_pixels_get_satd_dual_func(width); + sad_dual_func = uvg_pixels_get_sad_dual_func(width); + } else { + assert(false && "Joose promised to fix this."); + } uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); @@ -1447,6 +1464,10 @@ int8_t uvg_search_intra_chroma_rdo( { const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); + int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); + int8_t width = 1 << log2_width; + int8_t height = 1 << log2_width; + const cu_loc_t loc = { x_px & ~7, y_px & ~7, width, height, width, height }; uvg_intra_references refs[2]; const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; @@ -1457,16 +1478,13 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - uvg_intra_build_reference(log2_width, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); - uvg_intra_build_reference(log2_width, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); - int8_t width = 1 << log2_width; - int8_t height = 1 << log2_width; - const cu_loc_t loc = { x_px &~7, y_px & ~7, width, height, width, height}; + const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; @@ -1659,26 +1677,10 @@ 
int8_t uvg_search_cu_intra_chroma( // num_modes is 0.is 0. if(state->encoder_control->cfg.cclm && 0){ - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7}; - - uvg_intra_references refs_u; - uvg_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); - - uvg_intra_references refs_v; - uvg_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); - - vector2d_t lcu_cpx = { (lcu_px.x & ~7) / 2, (lcu_px.y & ~7) / 2 }; - uvg_pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - uvg_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + num_modes = search_intra_chroma_rough(state, x_px, y_px, depth, - ref_u, - ref_v, - LCU_WIDTH_C, - &refs_u, - &refs_v, + &lcu_px, chroma_data, lcu, intra_mode, @@ -1819,7 +1821,7 @@ void uvg_search_cu_intra( int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); if (depth > 0) { - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); } // The maximum number of possible MIP modes depend on block size & shape @@ -1887,7 +1889,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + uvg_intra_build_reference(&cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; diff --git 
a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 179763a7..4e050f79 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -129,8 +129,8 @@ static void uvg_angular_pred_generic( // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - // ISP_TODO: replace latter width with height - int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); + const int side_size = vertical_mode ? log2_height : log2_width; + int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); // Pointer for the reference we are interpolating from. uvg_pixel *ref_main; @@ -524,25 +524,26 @@ static void uvg_intra_pred_filtered_dc_generic( /** * \brief Position Dependent Prediction Combination for Planar and DC modes. -* \param log2_width Log2 of width, range 2..5. -* \param width Block width matching log2_width. +* \param cu_loc CU location and size data. * \param used_ref Pointer used reference pixel struct. * \param dst Buffer of size width*width. */ static void uvg_pdpc_planar_dc_generic( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; - // TODO: replace latter log2_width with log2_height - const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2); + const int scale = (log2_width + log2_height - 2) >> 2; - // TODO: replace width with height - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { int wT = 32 >> MIN(31, ((y << 1) >> scale)); for (int x = 0; x < width; x++) { int wL = 32 >> MIN(31, ((x << 1) >> scale)); diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 4f14c376..9708a3d8 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -69,8 +69,8 @@ typedef void (intra_pred_filtered_dc_func)( typedef void (pdpc_planar_dc_func)( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst); diff --git a/src/uvg266.h b/src/uvg266.h index b5103249..4ab7ec1f 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -452,7 +452,7 @@ typedef struct uvg_config /** \brief Flag to enable/disable open GOP configuration */ int8_t open_gop; - int32_t vaq; /** \brief Enable variance adaptive quantization*/ + int32_t vaq; /** \brief Enable variance adaptive quantization*/ /** \brief Type of scaling lists to use */ int8_t scaling_list; From ec4909095c8f4bdf830cdbc8cf2cf64206e52bed Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 21 Jul 2022 16:27:48 +0300 Subject: [PATCH 004/254] [isp] Do not filter references if ISP is used. --- src/intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 0611d3f1..df7a971b 100644 --- a/src/intra.c +++ b/src/intra.c @@ -928,7 +928,7 @@ static void intra_predict_regular( uint8_t multi_ref_index = color == COLOR_Y ? 
multi_ref_idx : 0; const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || width != height /*Fake ISP*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. From 6236cc29beea192bda77ef993d6afcd263377c77 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 22 Jul 2022 13:10:21 +0300 Subject: [PATCH 005/254] [isp] Fix avx2 function call. --- src/strategies/avx2/intra-avx2.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 21c5c66f..79e60def 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -972,12 +972,16 @@ static void uvg_intra_pred_filtered_dc_avx2( */ static void uvg_pdpc_planar_dc_avx2( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; __m256i shuf_mask_byte = _mm256_setr_epi8( 0, -1, 0, -1, 0, -1, 0, -1, From 06532dce0219158d7afac72592b37eb61b51bb29 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 29 Jul 2022 15:36:56 +0300 Subject: [PATCH 006/254] [isp] Implement ISP search and partitioning. Add helper function for constructing cu_loc types. WIP stuff for transform. 
--- src/cu.c | 25 +++++++++++ src/cu.h | 3 ++ src/intra.c | 103 ++++++++++++++++++++++++++------------------ src/search.c | 8 +++- src/search.h | 8 ++++ src/search_inter.c | 18 ++++++-- src/search_intra.c | 105 +++++++++++++++++++++++++++++++++++++++++---- src/search_intra.h | 2 + src/transform.c | 57 ++++++++++++++---------- src/transform.h | 3 +- 10 files changed, 251 insertions(+), 81 deletions(-) diff --git a/src/cu.c b/src/cu.c index 40fce65e..f47f5cf3 100644 --- a/src/cu.c +++ b/src/cu.c @@ -251,3 +251,28 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu } } } + +/* + * \brief Constructs cu_loc_t based on given parameters. Calculates chroma dimensions automatically. + * + * \param loc Destination cu_loc. + * \param x Block top left x coordinate. + * \param y Block top left y coordinate. + * \param width Block width. + * \param height Block height. +*/ +void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) +{ + assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions."); + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH)."); + assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4."); // TODO: change if luma size 2 is allowed + + loc->x = x; + loc->y = y; + loc->width = width; + loc->height = height; + // TODO: when MTT is implemented, chroma dimensions can be minimum 2. + // Chroma width is half of luma width, when not at maximum depth. 
+ loc->chroma_width = MAX(width >> 1, 4); + loc->chroma_height = MAX(height >> 1, 4); +} diff --git a/src/cu.h b/src/cu.h index ddddaf55..6fe960e7 100644 --- a/src/cu.h +++ b/src/cu.h @@ -185,6 +185,7 @@ typedef struct uint8_t multi_ref_idx; int8_t mip_flag; int8_t mip_is_transposed; + int8_t isp_mode; } intra; struct { mv_t mv[2][2]; // \brief Motion vectors for L0 and L1 @@ -206,6 +207,8 @@ typedef struct { int8_t chroma_height; } cu_loc_t; +void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) diff --git a/src/intra.c b/src/intra.c index df7a971b..cab91005 100644 --- a/src/intra.c +++ b/src/intra.c @@ -37,6 +37,8 @@ #include "image.h" #include "uvg_math.h" #include "mip_data.h" +#include "search.h" +#include "search_intra.h" #include "strategies/strategies-intra.h" #include "tables.h" #include "transform.h" @@ -1471,9 +1473,7 @@ const cu_info_t* uvg_get_co_located_luma_cu( static void intra_recon_tb_leaf( encoder_state_t* const state, - int x, - int y, - int depth, + const cu_loc_t* cu_loc, lcu_t *lcu, color_t color, const intra_search_data_t* search_data, @@ -1482,13 +1482,14 @@ static void intra_recon_tb_leaf( const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; - int log2width = LOG2_LCU_WIDTH - depth; - if (color != COLOR_Y && depth < MAX_PU_DEPTH) { - // Chroma width is half of luma width, when not at maximum depth. - log2width -= 1; - } - const int width = 1 << log2width; - const int height = width; // TODO: proper height for non-square blocks + const int x = cu_loc->x; + const int y = cu_loc->y; + + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + int log2_width = uvg_g_convert_to_bit[width] + 2; + int log2_height = uvg_g_convert_to_bit[height] + 2; + const int lcu_width = LCU_WIDTH >> shift; const vector2d_t luma_px = { x, y }; @@ -1510,25 +1511,20 @@ static void intra_recon_tb_leaf( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; - height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. - height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); + int ref_height = height * 2 + MAX_REF_LINE_IDX; + ref_height = MIN(ref_height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. + ref_height = MIN(ref_height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], &extra_refs[i * 128], - 1, height, + 1, ref_height, frame->rec->stride, 1); } } - cu_loc_t loc = { - x, y, - width, height, - width, height, - }; - uvg_intra_build_reference(&loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + uvg_intra_build_reference(cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); uvg_pixel pred[32 * 32]; - uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, cu_loc, color, pred, search_data, lcu, tree_type); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; @@ -1548,9 +1544,9 @@ static void intra_recon_tb_leaf( default: break; } - uvg_pixels_blit(pred, block , width, width, width, lcu_width); + uvg_pixels_blit(pred, block , width, height, width, lcu_width); if(color != COLOR_Y && cfg->jccr) { - uvg_pixels_blit(pred, block2, 
width, width, width, lcu_width); + uvg_pixels_blit(pred, block2, width, height, width, lcu_width); } } @@ -1583,6 +1579,7 @@ void uvg_intra_recon_cu( { const vector2d_t lcu_px = { SUB_SCU(x) >> (tree_type == UVG_CHROMA_T), SUB_SCU(y) >> (tree_type == UVG_CHROMA_T) }; const int8_t width = LCU_WIDTH >> depth; + const int8_t height = width; // TODO: height for non-square blocks. if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } @@ -1620,6 +1617,7 @@ void uvg_intra_recon_cu( LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, }; + // ISP_TODO: does not work with ISP yet, ask Joose when this is relevant. if (recon_luma && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); } @@ -1627,23 +1625,46 @@ void uvg_intra_recon_cu( cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); } - } else { - const bool has_luma = recon_luma; - const bool has_chroma = recon_chroma && (x % 8 == 0 && y % 8 == 0); - - // Process a leaf TU. - if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data, tree_type); - } - if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data, tree_type); - } - - uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), - search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, - x, y, depth, cur_cu, lcu, - false, - tree_type); + return; } + if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. 
+    int split_type = search_data->pred_cu.intra.isp_mode;
+    int part_dim = uvg_get_isp_split_dim(width, height, split_type);
+    int limit = split_type == ISP_MODE_HOR ? height : width;
+    for (int part = 0; part < limit; part += part_dim) {
+      const int part_x = split_type == ISP_MODE_HOR ? x : x + part;
+      const int part_y = split_type == ISP_MODE_HOR ? y + part: y;
+      const int part_w = split_type == ISP_MODE_HOR ? part_dim : width;
+      const int part_h = split_type == ISP_MODE_HOR ? height : part_dim;
+
+      cu_loc_t loc;
+      uvg_cu_loc_ctor(&loc, part_x, part_y, part_w, part_h);
+
+      intra_recon_tb_leaf(state, &loc, lcu, COLOR_Y, search_data, tree_type);
+      uvg_quantize_lcu_residual(state, true, false, false,
+        &loc, depth, cur_cu, lcu,
+        false, tree_type);
+    }
+  }
+  const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP;
+  const bool has_chroma = recon_chroma && (x % 8 == 0 && y % 8 == 0);
+
+  cu_loc_t loc;
+  uvg_cu_loc_ctor(&loc, x, y, width, height);
+
+  // Process a leaf TU. 
+  if (has_luma) {
+    intra_recon_tb_leaf(state, &loc, lcu, COLOR_Y, search_data, tree_type);
+  }
+  if (has_chroma) {
+    intra_recon_tb_leaf(state, &loc, lcu, COLOR_U, search_data, tree_type);
+    intra_recon_tb_leaf(state, &loc, lcu, COLOR_V, search_data, tree_type);
+  }
+
+  uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3),
+    search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma,
+    &loc, depth, cur_cu, lcu,
+    false, tree_type);
 }
diff --git a/src/search.c b/src/search.c
index cb9fc1d1..4fbf33f3 100644
--- a/src/search.c
+++ b/src/search.c
@@ -170,6 +170,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in
     to->intra.multi_ref_idx = cu->intra.multi_ref_idx;
     to->intra.mip_flag = cu->intra.mip_flag;
     to->intra.mip_is_transposed = cu->intra.mip_is_transposed;
+    to->intra.isp_mode = cu->intra.isp_mode;
   } else {
     to->skipped = cu->skipped;
     to->merged = cu->merged;
@@ -1091,10 +1092,13 @@ static double search_cu(
       inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda;
     }
-
+    cu_loc_t loc;
+    const int width = LCU_WIDTH >> depth;
+    const int height = width; // TODO: height for non-square blocks
+    uvg_cu_loc_ctor(&loc, x, y, width, height);
     uvg_quantize_lcu_residual(state,
                               true, has_chroma && !cur_cu->joint_cb_cr,
-                              cur_cu->joint_cb_cr, x, y,
+                              cur_cu->joint_cb_cr, &loc,
                               depth,
                               NULL,
                               lcu,
diff --git a/src/search.h b/src/search.h
index 7566fb96..2a5a6867 100644
--- a/src/search.h
+++ b/src/search.h
@@ -77,6 +77,14 @@ typedef struct unit_stats_map_t {
 #define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 
16 : 12)) #define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1) +// ISP related defines +#define NUM_ISP_MODES 3 +#define ISP_MODE_NO_ISP 0 +#define ISP_MODE_HOR 1 +#define ISP_MODE_VER 2 +#define SPLIT_TYPE_HOR 1 +#define SPLIT_TYPE_VER 2 + void uvg_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length); diff --git a/src/search_inter.c b/src/search_inter.c index 6508995f..7922f34b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1679,6 +1679,7 @@ static void search_pu_inter(encoder_state_t * const state, const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; const int width_cu = LCU_WIDTH >> depth; + const int height_cu = width_cu; // TODO: non-square blocks const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); const int width = PU_GET_W(part_mode, width_cu, i_pu); @@ -1826,7 +1827,11 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, width_cu, height_cu); + + uvg_quantize_lcu_residual(state, true, false, false, &loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; @@ -1836,7 +1841,7 @@ static void search_pu_inter(encoder_state_t * const state, uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, + &loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if 
(!cbf_is_set_any(cur_pu->cbf, depth)) { @@ -2151,6 +2156,10 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, const int x_px = SUB_SCU(x); const int y_px = SUB_SCU(y); const int width = LCU_WIDTH >> depth; + const int height = width; // TODO: non-square blocks + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, width, height); + cabac_data_t cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); cabac_data_t* cabac = &state->search_cabac; @@ -2198,7 +2207,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, uvg_quantize_lcu_residual(state, true, false, - false, x, y, + false, &loc, depth, cur_cu, lcu, @@ -2263,7 +2272,8 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, else { uvg_quantize_lcu_residual(state, true, reconstruct_chroma, - reconstruct_chroma && state->encoder_control->cfg.jccr, x, y, + reconstruct_chroma && state->encoder_control->cfg.jccr, + &loc, depth, cur_cu, lcu, diff --git a/src/search_intra.c b/src/search_intra.c index 1ce4c8a5..f3c8c838 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -49,6 +49,7 @@ #include "strategies/strategies-picture.h" #include "videoframe.h" #include "strategies/strategies-quant.h" +#include "uvg_math.h" // Normalize SAD for comparison against SATD to estimate transform skip @@ -247,6 +248,76 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, } +// ISP_TODO: move this function if it is used elsewhere +bool can_use_isp(const int width, const int height, const int max_tr_size) +{ + const int log2_width = uvg_g_convert_to_bit[width] + 2; + const int log2_height = uvg_g_convert_to_bit[height] + 2; + + // Each split block must have at least 16 samples. 
+  bool not_enough_samples = (log2_width + log2_height <= 4);
+  bool cu_size_larger_than_max_tr_size = width > max_tr_size || height > max_tr_size;
+  if (not_enough_samples || cu_size_larger_than_max_tr_size) {
+    return false;
+  }
+  return true;
+}
+
+
+/**
+* \brief Returns ISP split partition size based on block dimensions and split type.
+*
+* Returns ISP split partition size based on block dimensions and split type.
+* Will fail if resulting partition size has less than 16 samples.
+*
+* \param width Block width.
+* \param height Block height.
+* \param split_type Horizontal or vertical split.
+*/
+int uvg_get_isp_split_dim(const int width, const int height, const int split_type)
+{
+  bool divide_in_rows = split_type == SPLIT_TYPE_HOR;
+  int split_dim_size, non_split_dim_size, partition_size, div_shift = 2;
+
+  if (divide_in_rows) {
+    split_dim_size = height;
+    non_split_dim_size = width;
+  }
+  else {
+    split_dim_size = width;
+    non_split_dim_size = height;
+  }
+
+  // ISP_TODO: make a define for this. Depends on minimum transform block log2 side length
+  const int min_num_samples = 16; // Minimum allowed number of samples for split block
+  const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1;
+  partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift);
+
+  assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) >= uvg_math_floor_log2(min_num_samples)) &&
+    "Partition has less than allowed minimum number of samples.");
+  return partition_size;
+}
+
+
+// ISP_TODO: move this function if it is used elsewhere
+bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode)
+{
+  if (isp_mode == ISP_MODE_NO_ISP) {
+    return false;
+  }
+  const int tu_width = isp_mode == ISP_MODE_HOR ? 
width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER);
+  const int tu_height = isp_mode == ISP_MODE_HOR ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height;
+
+  // ISP_TODO: make a define for this or use existing
+  const int min_tb_size = 4;
+
+  if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) {
+    return false;
+  }
+  return true;
+}
+
+
 /**
 * \brief Perform search for best intra transform split configuration.
 *
@@ -325,6 +396,8 @@ static double search_intra_trdepth(
   {
     trafo = 0;
     num_transforms = (mts_enabled ? MTS_TR_NUM : 1);
+    // Do not do MTS search if ISP mode is used
+    num_transforms = pred_cu->intra.isp_mode == ISP_MODE_NO_ISP ? num_transforms : 1;
   }
   const int mts_start = trafo;
   //TODO: height
@@ -360,6 +433,11 @@ static double search_intra_trdepth(
   pred_cu->violates_lfnst_constrained_chroma = false;
   pred_cu->lfnst_last_scan_pos = false;
+  if (pred_cu->lfnst_idx != 0) {
+    // Cannot use ISP with LFNST for small blocks
+    pred_cu->intra.isp_mode = can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode) ? pred_cu->intra.isp_mode : ISP_MODE_NO_ISP;
+  }
+
   for (trafo = mts_start; trafo < num_transforms; trafo++) {
     pred_cu->tr_idx = trafo;
     pred_cu->tr_skip = trafo == MTS_SKIP;
@@ -1371,18 +1449,27 @@ static int8_t search_intra_rdo(
   enum uvg_tree_type tree_type)
 {
   const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra);
+  const int width = LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
   for (int mode = 0; mode < modes_to_check; mode++) {
-    double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu);
-    search_data[mode].pred_cu.tr_idx = MTS_TR_NUM;
-    search_data[mode].bits = rdo_bitcost;
-    search_data[mode].cost = rdo_bitcost * state->lambda;
+    bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? 
false: true; // Cannot use ISP with MIP + can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL + int max_isp_modes = can_do_isp_search && can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); - search_data[mode].cost += mode_cost; - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { - modes_to_check = mode + 1; - break; + for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { + search_data[mode].pred_cu.intra.isp_mode = isp_mode; + double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); + search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; + search_data[mode].bits = rdo_bitcost; + search_data[mode].cost = rdo_bitcost * state->lambda; + + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); + search_data[mode].cost += mode_cost; + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { + modes_to_check = mode + 1; + break; + } } } diff --git a/src/search_intra.h b/src/search_intra.h index 36470e63..307b5ad9 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -66,4 +66,6 @@ void uvg_search_cu_intra( lcu_t *lcu, enum uvg_tree_type tree_type); +int uvg_get_isp_split_dim(const int width, const int height, const int split_type); + #endif // SEARCH_INTRA_H_ diff --git a/src/transform.c b/src/transform.c index c0adc121..abf793c2 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1105,14 +1105,15 @@ int uvg_quantize_residual_trskip( static void quantize_tr_residual( encoder_state_t * const state, const color_t color, - const int32_t x, - const int32_t y, + const cu_loc_t *cu_loc, const uint8_t depth, 
  cu_info_t *cur_pu,
  lcu_t* lcu,
  bool early_skip,
  enum uvg_tree_type tree_type)
{
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
   const uvg_config *cfg = &state->encoder_control->cfg;
   const int32_t shift = color == COLOR_Y ? 0 : 1;
   const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
@@ -1130,13 +1131,9 @@ static void quantize_tr_residual(
   // This should ensure that the CBF data doesn't get corrupted if this function
   // is called more than once.
-  int32_t tr_width;
-  if (color == COLOR_Y) {
-    tr_width = LCU_WIDTH >> depth;
-  } else {
-    const int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth);
-    tr_width = LCU_WIDTH_C >> chroma_depth;
-  }
+  int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+
   const int32_t lcu_width = LCU_WIDTH >> shift;
   const int8_t mode =
     (color == COLOR_Y) ? cur_pu->intra.mode : cur_pu->intra.mode_chroma;
@@ -1287,15 +1284,18 @@ void uvg_quantize_lcu_residual(
   const bool luma,
   const bool chroma,
   const bool jccr,
-  const int32_t x,
-  const int32_t y,
+  const cu_loc_t * cu_loc,
   const uint8_t depth,
   cu_info_t *cur_pu,
   lcu_t* lcu,
   bool early_skip,
   enum uvg_tree_type tree_type)
 {
-  const int32_t width = LCU_WIDTH >> depth;
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  const int width = cu_loc->width;
+  const int height = cu_loc->height;
+
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };

   if (cur_pu == NULL) {
@@ -1324,14 +1324,22 @@ void uvg_quantize_lcu_residual(
     // Split transform and increase depth
     const int offset = width / 2;
-    const int32_t x2 = x + offset;
-    const int32_t y2 = y + offset;
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < 2; ++i) {
+        cu_loc_t loc;
+        uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1);
+        // jccr is currently not supported if transform is split
+        uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, 
early_skip, tree_type); + } + } - // jccr is currently not supported if transform is split - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip, tree_type); + //const int32_t x2 = x + offset; + //const int32_t y2 = y + offset; + + //uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip, tree_type); + //uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip, tree_type); + //uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip, tree_type); + //uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip, tree_type); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { @@ -1348,15 +1356,18 @@ void uvg_quantize_lcu_residual( } else { // Process a leaf TU. 
+ cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, width, height); + if (luma) { - quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_Y, &loc, depth, cur_pu, lcu, early_skip, tree_type); } if (chroma) { - quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip, tree_type); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_U, &loc, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_V, &loc, depth, cur_pu, lcu, early_skip, tree_type); } if (jccr && cur_pu->tr_depth == cur_pu->depth) { - quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_UV, &loc, depth, cur_pu, lcu, early_skip, tree_type); } if(chroma && jccr && cur_pu->tr_depth == cur_pu->depth) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); diff --git a/src/transform.h b/src/transform.h index d3f44edf..61c50c04 100644 --- a/src/transform.h +++ b/src/transform.h @@ -89,8 +89,7 @@ void uvg_quantize_lcu_residual( bool luma, bool chroma, const bool jccr, - int32_t x, - int32_t y, + const cu_loc_t* cu_loc, uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu, From 626c9b02eaf45d6e5923a351a9ce879ecde546d9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Aug 2022 13:23:27 +0300 Subject: [PATCH 007/254] [isp] Modify transform and quantization functions to handle non-square blocks. Add strategy headers to CMakelist. 
--- CMakeLists.txt | 2 +- src/cu.h | 4 +- src/search.c | 28 +++++----- src/search_inter.c | 2 + src/search_intra.c | 15 +++--- src/strategies/avx2/dct-avx2.c | 4 +- src/strategies/avx2/intra-avx2.c | 3 ++ src/strategies/avx2/picture-avx2.c | 4 +- src/strategies/avx2/quant-avx2.c | 12 ++--- src/strategies/generic/dct-generic.c | 18 +++++-- src/strategies/generic/picture-generic.c | 4 +- src/strategies/generic/quant-generic.c | 35 ++++++------ src/strategies/generic/quant-generic.h | 3 +- src/strategies/strategies-dct.c | 10 +++- src/strategies/strategies-dct.h | 4 +- src/strategies/strategies-picture.h | 2 +- src/strategies/strategies-quant.h | 20 +++++-- src/transform.c | 68 ++++++++++++++---------- src/transform.h | 5 +- tests/mts_tests.c | 6 +-- 20 files changed, 153 insertions(+), 96 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0ec99c7..ab0b63a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") # Add also all the strategies -file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c") diff --git a/src/cu.h b/src/cu.h index 6fe960e7..f5eeb5e6 100644 --- a/src/cu.h +++ b/src/cu.h @@ -415,9 +415,9 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu */ static INLINE void copy_coeffs(const coeff_t *__restrict src, coeff_t *__restrict dest, - size_t width) + size_t width, size_t height) { - memcpy(dest, src, width * width * sizeof(coeff_t)); + memcpy(dest, src, width * height * sizeof(coeff_t)); } diff --git 
a/src/search.c b/src/search.c index 4fbf33f3..56e07b06 100644 --- a/src/search.c +++ b/src/search.c @@ -89,20 +89,20 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr } } -static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum +static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum uvg_tree_type tree_type) { if (tree_type != UVG_CHROMA_T) { - const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local); - copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width); + const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); + copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], cu_loc->width, cu_loc->height); } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T)); - copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1); - copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1); + const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); + copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); + copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); if (joint) { - copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1); + copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); } } } @@ -114,9 +114,11 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t uvg_tree_type tree_type) { const int width = LCU_WIDTH >> depth; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_local, y_local, width, width); copy_cu_info (x_local, y_local, width, 
&work_tree[depth + 1], &work_tree[depth]); copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type); - copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); + copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); } @@ -1093,7 +1095,7 @@ static double search_cu( } cu_loc_t loc; - const int width = LCU_WIDTH << depth; + const int width = LCU_WIDTH >> depth; const int height = width; // TODO: height for non-square blocks uvg_cu_loc_ctor(&loc, x, y, width, height); uvg_quantize_lcu_residual(state, @@ -1579,7 +1581,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); // Copy coeffs to encoder state. - copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH); + copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( @@ -1596,9 +1598,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C); } } diff --git a/src/search_inter.c b/src/search_inter.c index 7922f34b..ff511740 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2225,6 +2225,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, u_pred, u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -2232,6 +2233,7 @@ void 
uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_pred, v_resi, width, + height, LCU_WIDTH_C, width); diff --git a/src/search_intra.c b/src/search_intra.c index f3c8c838..06b86cc7 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -249,8 +249,11 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, // ISP_TODO: move this function if it is used elsewhere -bool can_use_isp(const int width, const int height, const int max_tr_size) +static INLINE bool can_use_isp(const int width, const int height, const int max_tr_size) { + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); + assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); + const int log2_width = uvg_g_convert_to_bit[width] + 2; const int log2_height = uvg_g_convert_to_bit[height] + 2; @@ -300,16 +303,14 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ // ISP_TODO: move this function if it is used elsewhere -bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode) +static INLINE bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode) { if (isp_mode == ISP_MODE_NO_ISP) { return false; } const int tu_width = isp_mode == ISP_MODE_HOR ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER); const int tu_height = isp_mode == ISP_MODE_HOR ? 
uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; - - // ISP_TODO: make a define for this or use existing - const int min_tb_size = 4; + const int min_tb_size = TR_MIN_WIDTH; if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) { return false; @@ -1449,7 +1450,7 @@ static int8_t search_intra_rdo( enum uvg_tree_type tree_type) { const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); - const int width = LCU_WIDTH << depth; + const int width = LCU_WIDTH >> depth; const int height = width; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { @@ -1633,6 +1634,7 @@ int8_t uvg_search_intra_chroma_rdo( u_pred, u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -1640,6 +1642,7 @@ int8_t uvg_search_intra_chroma_rdo( v_pred, v_resi, width, + height, LCU_WIDTH_C, width); uvg_chorma_ts_out_t chorma_ts_out; diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index b695273b..f3c812ed 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1590,18 +1590,20 @@ static void mts_dct_avx2( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) { tr_type_t type_hor; tr_type_t type_ver; + // ISP_TODO: height passed but not used uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx) { - dct_func* dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 79e60def..fc19654a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -61,6 +61,7 @@ static void uvg_angular_pred_avx2( uvg_pixel *const dst, const uint8_t multi_ref_idx) { + // 
ISP_TODO: non-square block implementation, height is passed but not used const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_bit[width] + 2; @@ -512,6 +513,7 @@ static void uvg_intra_pred_planar_avx2( const uint8_t *const ref_left, uint8_t *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_bit[width] + 2; @@ -977,6 +979,7 @@ static void uvg_pdpc_planar_dc_avx2( const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used assert(mode == 0 || mode == 1); // planar or DC const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index df90f149..a911928d 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1743,8 +1743,8 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t* return diff; } -static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) { - +static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) { + // ISP_TODO: non-square block implementation, height is passed but not used __m128i diff = _mm_setzero_si128(); switch (width) { case 4: diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 5c39fe11..8313b1f0 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -626,7 +626,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_avx2(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uint8_t *const ref_in, const uint8_t *const pred_in, @@ -637,15 +637,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Temporary arrays to pass data to and from uvg_quant and transform functions. 
ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - - const int height = width; // TODO: height for non-square blocks + // ISP_TODO: non-square block implementation, height is passed but not used + int has_coeffs = 0; assert(width <= TR_MAX_WIDTH); assert(width >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; @@ -662,10 +662,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint16_t lfnst_index = color == COLOR_Y ? 
cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index cd05a01f..00562737 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -739,6 +739,11 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \ } +static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output) +{ + // ISP_TODO: non-square transform here +} + DCT_NXN_GENERIC(4); DCT_NXN_GENERIC(8); DCT_NXN_GENERIC(16); @@ -2487,26 +2492,28 @@ static void mts_dct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) { tr_type_t type_hor; tr_type_t type_ver; + // ISP_TODO: height passed but not used uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) { - dct_func *dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else { - const int height = width; int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0); int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + const int log2_height_minus2 = uvg_g_convert_to_bit[height]; if(tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { @@ -2521,11 +2528,11 @@ static void mts_dct_generic( } partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2]; + partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus2]; int16_t tmp[32 * 32]; const int32_t shift_1st = log2_width_minus2 + bitdepth - 7; - const int32_t shift_2nd = log2_width_minus2 + 8; + const int32_t shift_2nd = log2_height_minus2 + 8; dct_hor(input, tmp, shift_1st, height, 0, skip_width); dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); @@ -2582,6 +2589,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic); success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic); success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic); + success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic); success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 817befed..6797a669 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -783,10 +783,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len) static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, - int width, int ref_stride, int pred_stride) + int width, int height, int ref_stride, int pred_stride) { int y, x; - for (y = 0; y < width; ++y) { + for 
(y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 96d2567a..03d4daf8 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -237,6 +237,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -247,28 +248,28 @@ int uvg_quant_cbcr_residual_generic( uvg_pixel* v_rec_out, coeff_t* coeff_out, bool early_skip, - int lmcs_chroma_adj, enum uvg_tree_type tree_type - ) { + int lmcs_chroma_adj, enum uvg_tree_type tree_type) +{ ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - + // ISP_TODO: this function is not fully converted to handle non-square blocks { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]); v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]); } } } - uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride); - uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride); + uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride); + uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride); const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? 
-1 : 1); - for (int y = 0; y < width; y++) + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -305,9 +306,9 @@ int uvg_quant_cbcr_residual_generic( } - uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); } if (state->encoder_control->cfg.rdoq_enable && @@ -441,7 +442,7 @@ int uvg_quant_cbcr_residual_generic( * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -454,19 +455,17 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int has_coeffs = 0; - assert(width <= TR_MAX_WIDTH); - assert(width >= TR_MIN_WIDTH); - - const int height = width; // TODO: height for non-square blocks + assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); + assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); // Get residual. 
(ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { sign = residual[x + y * width] >= 0 ? 1 : -1; absval = sign * residual[x + y * width]; @@ -477,10 +476,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint8_t lfnst_index = color == COLOR_Y ? 
cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h index da2b05ae..ba1fa130 100644 --- a/src/strategies/generic/quant-generic.h +++ b/src/strategies/generic/quant-generic.h @@ -60,7 +60,7 @@ void uvg_quant_generic( uint8_t lfnst_idx); int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -71,6 +71,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 4ba2a37b..07f0fcb4 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0; dct_func * uvg_dct_8x8 = 0; dct_func * uvg_dct_16x16 = 0; dct_func * uvg_dct_32x32 = 0; +dct_func * uvg_dct_non_square = 0; dct_func * uvg_fast_inverse_dst_4x4 = 0; @@ -56,9 +57,11 @@ void(*uvg_mts_dct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, const int8_t mts_idx); + void(*uvg_mts_idct)(int8_t bitdepth, color_t color, const cu_info_t *tu, @@ -90,8 +93,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) { * * \returns Pointer to the function. */ -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. 
Return generic dct for non-square blokcs. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function."); + return uvg_dct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index d58bf5a9..50cc3b5a 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4; extern dct_func * uvg_dct_8x8; extern dct_func * uvg_dct_16x16; extern dct_func * uvg_dct_32x32; +extern dct_func * uvg_dct_non_square; extern dct_func * uvg_fast_inverse_dst_4x4; @@ -64,6 +65,7 @@ typedef void (mts_dct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx); @@ -82,7 +84,7 @@ typedef void (mts_idct_func)( extern mts_idct_func* uvg_mts_idct; int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type); +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type); dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type); diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 88f52cfc..8d73f74c 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu, typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len); -typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride); +typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride); extern const uint32_t uvg_crc_table[256]; diff --git 
a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h index a6c9a3d4..2920ed82 100644 --- a/src/strategies/strategies-quant.h +++ b/src/strategies/strategies-quant.h @@ -45,12 +45,23 @@ #include "tables.h" // Declare function pointers. -typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx); +typedef unsigned (quant_func)( + const encoder_state_t * const state, + coeff_t *coef, + coeff_t *q_coef, + int32_t width, + int32_t height, + color_t color, + int8_t scan_idx, + int8_t block_type, + int8_t transform_skip, + uint8_t lfnst_idx); + typedef unsigned (quant_cbcr_func)( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -63,15 +74,18 @@ typedef unsigned (quant_cbcr_func)( bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (quant_residual_func)(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, uvg_pixel *rec_out, coeff_t *coeff_out, bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, color_t color, int8_t block_type, int8_t transform_skip); + typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); diff --git 
a/src/transform.c b/src/transform.c index abf793c2..0f73eeeb 100644 --- a/src/transform.c +++ b/src/transform.c @@ -77,6 +77,7 @@ const uint8_t uvg_g_chroma_scale[58]= * Parameters pred_in and rec_out may be aliased. * * \param width Transform width. + * \param height Transform height. * \param in_stride Stride for ref_in and pred_in * \param out_stride Stride for rec_out. * \param ref_in Reference pixels. @@ -87,6 +88,7 @@ const uint8_t uvg_g_chroma_scale[58]= * \returns Whether coeff_out contains any non-zero coefficients. */ static bool bypass_transquant(const int width, + const int height, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, @@ -96,7 +98,7 @@ static bool bypass_transquant(const int width, { bool nonzero_coeffs = false; - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int32_t in_idx = x + y * in_stride; int32_t out_idx = x + y * out_stride; @@ -123,6 +125,7 @@ static bool bypass_transquant(const int width, * \param coeff coefficients (residual) to filter */ static void rdpcm(const int width, + const int height, const rdpcm_dir dir, coeff_t *coeff) { @@ -130,7 +133,7 @@ static void rdpcm(const int width, const int min_x = (dir == RDPCM_HOR) ? 1 : 0; const int min_y = (dir == RDPCM_HOR) ? 
0 : 1; - for (int y = width - 1; y >= min_y; y--) { + for (int y = height - 1; y >= min_y; y--) { for (int x = width - 1; x >= min_x; x--) { const int index = x + y * width; coeff[index] -= coeff[index - offset]; @@ -203,17 +206,18 @@ void uvg_derive_lfnst_constraints( /** * \brief NxN inverse transform (2D) - * \param coeff input data (transform coefficients) - * \param block output data (residual) - * \param block_size input data (width of transform) + * \param coeff input data (transform coefficients) + * \param block output data (residual) + * \param width transform width + * \param height transform height */ -void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height) { - int32_t j,k; - for (j = 0; j < block_size; j++) { - for(k = 0; k < block_size; k ++) { + int32_t j, k; + for (j = 0; j < height; j++) { + for(k = 0; k < width; k ++) { // Casting back and forth to make UBSan not trigger due to left-shifting negatives - coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k])); + coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k])); } } } @@ -243,17 +247,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block, void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx) + if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height) { - uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts); + uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts); } else { - dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type); 
+ dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type); dct_func(encoder->bitdepth, block, coeff); } } @@ -373,6 +378,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask1 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -386,6 +392,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask2 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -492,10 +499,10 @@ void uvg_chroma_transform_search( ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; uvg_transform2d( - state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu + state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); uvg_transform2d( - state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu + state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu ); enum uvg_chroma_transforms transforms[5]; transforms[0] = DCT7_CHROMA; @@ -508,8 +515,8 @@ void uvg_chroma_transform_search( pred_cu->cr_lfnst_idx == 0 ; if (can_use_tr_skip) { - uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width); - uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width); + uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height); + uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height); transforms[num_transforms] = CHROMA_TS; num_transforms++; } @@ -1053,7 +1060,7 @@ void uvg_inv_lfnst( */ int uvg_quantize_residual_trskip( encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, int8_t *trskip_out, const int in_stride, 
const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -1074,7 +1081,7 @@ int uvg_quantize_residual_trskip( //noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; skip.has_coeffs = uvg_quantize_residual( - state, cur_cu, width, color, scan_order, + state, cur_cu, width, height, color, scan_order, 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj, UVG_BOTH_T /* tree type doesn't matter for transformskip*/); @@ -1090,9 +1097,9 @@ int uvg_quantize_residual_trskip( if (best->has_coeffs || rec_out != pred_in) { // If there is no residual and reconstruction is already in rec_out, // we can skip this. - uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride); + uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride); } - copy_coeffs(best->coeff, coeff_out, width); + copy_coeffs(best->coeff, coeff_out, width, height); return best->has_coeffs; } @@ -1131,8 +1138,8 @@ static void quantize_tr_residual( // This should ensure that the CBF data doesn't get corrupted if this function // is called more than once. - int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int32_t lcu_width = LCU_WIDTH >> shift; const int8_t mode = @@ -1183,7 +1190,9 @@ static void quantize_tr_residual( } if (cfg->lossless) { + // ISP_TODO: is there any sensible case where in and out strides would be different? 
has_coeffs = bypass_transquant(tr_width, + tr_height, lcu_width, // in stride lcu_width, // out stride ref, @@ -1193,9 +1202,9 @@ static void quantize_tr_residual( if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) { // implicit rdpcm for horizontal and vertical intra modes if (mode == 18) { - rdpcm(tr_width, RDPCM_HOR, coeff); + rdpcm(tr_width, tr_height, RDPCM_HOR, coeff); } else if (mode == 50) { - rdpcm(tr_width, RDPCM_VER, coeff); + rdpcm(tr_width, tr_height, RDPCM_VER, coeff); } } @@ -1206,6 +1215,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual_trskip(state, cur_pu, tr_width, + tr_height, color, scan_idx, &tr_skip, @@ -1222,6 +1232,7 @@ static void quantize_tr_residual( state, cur_pu, tr_width, + tr_height, scan_idx, lcu_width, lcu_width, @@ -1240,6 +1251,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual(state, cur_pu, tr_width, + tr_height, color, scan_idx, false, // tr skip @@ -1326,8 +1338,8 @@ void uvg_quantize_lcu_residual( const int offset = width / 2; for (int j = 0; j < 2; ++j) { for (int i = 0; i < 2; ++i) { - const cu_loc_t loc; - uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width, height); + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1); // jccr is currently not supported if transform is split uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, early_skip, tree_type); } diff --git a/src/transform.h b/src/transform.h index 61c50c04..a7b6e221 100644 --- a/src/transform.h +++ b/src/transform.h @@ -47,13 +47,14 @@ extern const uint8_t uvg_g_chroma_scale[58]; extern const int16_t uvg_g_inv_quant_scales[6]; extern const int16_t uvg_g_quant_scales[6]; -void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); +void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_itransformskip(const 
encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); diff --git a/tests/mts_tests.c b/tests/mts_tests.c index f607b77d..2a132c77 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -111,7 +111,7 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); + mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); } } } @@ -167,7 +167,7 @@ TEST dct(void) int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); @@ -192,7 +192,7 @@ TEST idct(void) int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]); From 
6a3ddfd0bcc90e31d6b6e6b6bb6687d47bee0259 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 5 Aug 2022 10:13:24 +0300 Subject: [PATCH 008/254] [isp] Modify inverse transform to handle non-square blocks. --- src/strategies/avx2/quant-avx2.c | 2 +- src/strategies/generic/dct-generic.c | 28 +++++++++++++++++++------- src/strategies/generic/quant-generic.c | 4 ++-- src/strategies/strategies-dct.c | 1 + src/strategies/strategies-dct.h | 1 + src/transform.c | 12 ++++++----- src/transform.h | 3 ++- tests/mts_tests.c | 2 +- 8 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 8313b1f0..664933a8 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -716,7 +716,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, uvg_itransformskip(state->encoder_control, residual, coeff, width); } else { - uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 00562737..edba54f5 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2499,8 +2499,8 @@ static void mts_dct_generic( { tr_type_t type_hor; tr_type_t type_ver; - // ISP_TODO: height passed but not used + // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) @@ -2514,6 +2514,7 @@ static void mts_dct_generic( int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); const int log2_width_minus2 = uvg_g_convert_to_bit[width]; const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + if(tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { @@ -2545,6 +2546,7 @@ static void mts_idct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) @@ -2552,26 +2554,38 @@ static void mts_idct_generic( tr_type_t type_hor; tr_type_t type_ver; + // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) { dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); idct_func(bitdepth, input, output); } else { - const int height = width; - const int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; - const int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0; + int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? 
height - 32 : 0; const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + + if (tu->lfnst_idx || tu->cr_lfnst_idx) { + if ((width == 4 && height > 4) || (width > 4 && height == 4)) { + skip_width == width - 4; + skip_height == height - 4; + } + else if ((width >= 8 && height >= 8)) { + skip_width = width - 8; + skip_height = height - 8; + } + } partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2]; partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2]; int16_t tmp[32 * 32]; - const int32_t shift_1st = 7; - const int32_t shift_2nd = 20 - bitdepth; + const int32_t shift_1st = log2_width_minus2 - 7; + const int32_t shift_2nd = log2_height_minus2 + 8; idct_ver(input, tmp, shift_1st, width, skip_width, skip_height); idct_hor(tmp, output, shift_2nd, height, 0, skip_width); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 03d4daf8..11a63871 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -349,7 +349,7 @@ int uvg_quant_cbcr_residual_generic( uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); } - uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? 
COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -537,7 +537,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, uvg_itransformskip(state->encoder_control, residual, coeff, width); } else { - uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 07f0fcb4..e7cc37e9 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -66,6 +66,7 @@ void(*uvg_mts_idct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, const int8_t mts_idx); diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index 50cc3b5a..59e05084 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -77,6 +77,7 @@ typedef void (mts_idct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx); diff --git a/src/transform.c b/src/transform.c index 0f73eeeb..53d27de7 100644 --- a/src/transform.c +++ b/src/transform.c @@ -266,17 +266,19 @@ void uvg_transform2d(const encoder_control_t * const encoder, void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { if (encoder->cfg.mts) { - uvg_mts_idct(encoder->bitdepth, color, tu, block_size, coeff, block, encoder->cfg.mts); + uvg_mts_idct(encoder->bitdepth, color, tu, block_width, block_height, coeff, block, encoder->cfg.mts); } else { - dct_func *idct_func = uvg_get_idct_func(block_size, color, tu->type); + // 
ISP_TODO: block height + dct_func *idct_func = uvg_get_idct_func(block_width, color, tu->type); idct_func(encoder->bitdepth, coeff, block); } } @@ -590,7 +592,7 @@ void uvg_chroma_transform_search( if (pred_cu->cr_lfnst_idx) { uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); } - uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); } else { @@ -617,7 +619,7 @@ void uvg_chroma_transform_search( if (pred_cu->cr_lfnst_idx) { uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); } - uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, pred_cu); } else { diff --git a/src/transform.h b/src/transform.h index a7b6e221..0da34a12 100644 --- a/src/transform.h +++ b/src/transform.h @@ -61,7 +61,8 @@ void uvg_transform2d(const encoder_control_t * const encoder, void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); diff --git a/tests/mts_tests.c b/tests/mts_tests.c index 2a132c77..e12de73e 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -134,7 +134,7 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH); + idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH); } } From cd7e091992f97522329886dfd7c819e56417b913 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 5 Aug 2022 13:48:35 +0300 Subject: [PATCH 009/254] [isp] Fix mistake in transform if clause. 
--- src/strategies/generic/dct-generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index edba54f5..9e410541 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2503,7 +2503,7 @@ static void mts_dct_generic( // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width == height) { dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); @@ -2557,7 +2557,7 @@ static void mts_idct_generic( // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width == height) { dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); idct_func(bitdepth, input, output); From a9090c99b5f33fa2d0b40591e3f7973f70047d00 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 5 Aug 2022 14:16:32 +0300 Subject: [PATCH 010/254] [isp] Fix error in inverse transform shifting. 
--- src/strategies/generic/dct-generic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 9e410541..717f1b5f 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2584,8 +2584,11 @@ static void mts_idct_generic( partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2]; int16_t tmp[32 * 32]; - const int32_t shift_1st = log2_width_minus2 - 7; - const int32_t shift_2nd = log2_height_minus2 + 8; + const int max_log2_tr_dynamic_range = 15; + const int transform_matrix_shift = 6; + + const int32_t shift_1st = transform_matrix_shift + 1; + const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth; idct_ver(input, tmp, shift_1st, width, skip_width, skip_height); idct_hor(tmp, output, shift_2nd, height, 0, skip_width); From 35271648dbf8d2af31f94df2f7c3803de87c8116 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Aug 2022 14:16:57 +0300 Subject: [PATCH 011/254] [isp] Fix some errors. Pass height to functions. Some WIP comments. 
--- src/rdo.c | 5 +++-- src/strategies/generic/quant-generic.c | 7 ++++--- src/transform.c | 4 +++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index f8ebacdf..aa78c697 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1386,8 +1386,8 @@ void uvg_rdoq( { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; - uint32_t log2_tr_width = uvg_math_floor_log2( height ); - uint32_t log2_tr_height = uvg_math_floor_log2( width ); + uint32_t log2_tr_width = uvg_math_floor_log2(width); + uint32_t log2_tr_height = uvg_math_floor_log2(height); int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; @@ -1697,6 +1697,7 @@ void uvg_rdoq( default: assert(0); } + // ISP_TODO: does height affect ctx_cbf? Do this when fixing other cbf stuff ctx_cbf = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 11a63871..81bf0892 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -63,6 +63,7 @@ void uvg_quant_generic( { const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + // ISP_TODO: width & height affect scan order const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); @@ -493,15 +494,15 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Quantize coeffs. 
(coeff -> coeff_out) if (state->encoder_control->cfg.rdoq_enable && - (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) + (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) // ISP_TODO: width check here might not be necessary, therefore also height check unnecessary. Investigate. { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, + uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { diff --git a/src/transform.c b/src/transform.c index 53d27de7..542ab3de 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1146,8 +1146,9 @@ static void quantize_tr_residual( const int32_t lcu_width = LCU_WIDTH >> shift; const int8_t mode = (color == COLOR_Y) ? cur_pu->intra.mode : cur_pu->intra.mode_chroma; + const coeff_scan_order_t scan_idx = - uvg_get_scan_order(cur_pu->type, mode, depth); + uvg_get_scan_order(cur_pu->type, mode, depth); // Height does not affect this const int offset = lcu_px.x + lcu_px.y * lcu_width; const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); @@ -1269,6 +1270,7 @@ static void quantize_tr_residual( } + // ISP_TODO: when other ISP things work, ask Joose about this cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { cbf_set(&cur_pu->cbf, depth, color); From 55d77c6b507bc9e3054896fe974ece65847aa66c Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Aug 2022 16:36:56 +0300 Subject: [PATCH 012/254] [isp] Add scan order tables for all possible block sizes. 
--- src/tables.c | 2511 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2511 insertions(+) diff --git a/src/tables.c b/src/tables.c index 422fd714..8c42964d 100644 --- a/src/tables.c +++ b/src/tables.c @@ -82,3 +82,2514 @@ const uint32_t* const uvg_g_sig_last_scan[3][5] = { {g_sig_last_scan_1_0, g_sig_last_scan_1_1, g_sig_last_scan_1_2, g_sig_last_scan_1_3, g_sig_last_scan_1_4}, {g_sig_last_scan_2_0, g_sig_last_scan_2_1, g_sig_last_scan_2_2, g_sig_last_scan_2_3, g_sig_last_scan_2_4} }; + +// Holds scan order indices for all possible block sizes for diagonal scan order and coefficient group scan order +static const uint32_t* const g_scan_order_buffer[32258] = { + 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, + 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, + 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, + 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, + 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, + 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, + 26, 25, 28, 27, 30, 29, 31, 0, 2, 1, 4, 3, 6, + 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, + 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, + 31, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, + 46, 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, + 57, 60, 59, 62, 61, 63, 0, 2, 1, 4, 3, 6, 5, + 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, + 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, 31, + 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, + 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, + 60, 59, 62, 61, 64, 63, 66, 65, 68, 67, 70, 69, 72, + 71, 74, 73, 76, 75, 78, 77, 80, 79, 82, 81, 84, 83, + 86, 
85, 88, 87, 90, 89, 92, 91, 94, 93, 96, 95, 98, + 97, 100, 99, 102, 101, 104, 103, 106, 105, 108, 107, 110, 109, + 112, 111, 114, 113, 116, 115, 118, 117, 120, 119, 122, 121, 124, + 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, + 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, + 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, + 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, + 18, 15, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 0, + 4, 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, + 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, + 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, + 38, 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, + 47, 60, 57, 54, 51, 61, 58, 55, 62, 59, 63, 0, 4, + 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, + 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, + 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, + 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, + 60, 57, 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, + 69, 66, 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, + 78, 75, 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, + 87, 100, 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, + 112, 109, 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, + 121, 118, 115, 125, 122, 119, 126, 123, 127, 0, 4, 1, 8, + 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, + 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, + 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, 35, 48, + 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, 60, 57, + 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, 69, 66, + 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, 78, 75, + 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, 87, 100, + 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, 112, 109, + 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, 121, 118, + 115, 128, 125, 122, 119, 132, 129, 126, 123, 136, 133, 130, 127, + 140, 137, 134, 131, 144, 141, 138, 135, 148, 145, 142, 139, 152, + 149, 146, 143, 156, 153, 150, 147, 160, 157, 154, 151, 164, 161, + 158, 155, 168, 165, 162, 159, 172, 169, 166, 163, 176, 173, 
170, + 167, 180, 177, 174, 171, 184, 181, 178, 175, 188, 185, 182, 179, + 192, 189, 186, 183, 196, 193, 190, 187, 200, 197, 194, 191, 204, + 201, 198, 195, 208, 205, 202, 199, 212, 209, 206, 203, 216, 213, + 210, 207, 220, 217, 214, 211, 224, 221, 218, 215, 228, 225, 222, + 219, 232, 229, 226, 223, 236, 233, 230, 227, 240, 237, 234, 231, + 244, 241, 238, 235, 248, 245, 242, 239, 252, 249, 246, 243, 253, + 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, + 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, + 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, + 21, 14, 7, 29, 22, 15, 30, 23, 31, 0, 8, 1, 16, + 9, 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, + 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, + 42, 35, 28, 21, 14, 7, 57, 50, 43, 36, 29, 22, 15, + 58, 51, 44, 37, 30, 23, 59, 52, 45, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, 0, 8, 1, 16, 9, + 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, + 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, + 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, + 72, 65, 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, + 45, 38, 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, + 82, 75, 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, + 55, 112, 105, 98, 91, 84, 77, 70, 63, 120, 113, 106, 99, + 92, 85, 78, 71, 121, 114, 107, 100, 93, 86, 79, 122, 115, + 108, 101, 94, 87, 123, 116, 109, 102, 95, 124, 117, 110, 103, + 125, 118, 111, 126, 119, 127, 0, 8, 1, 16, 9, 2, 24, + 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, + 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, + 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, 72, 65, + 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, 45, 38, + 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, 82, 75, + 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, 55, 112, + 105, 98, 91, 84, 77, 70, 63, 120, 113, 106, 99, 92, 85, + 78, 71, 128, 121, 114, 107, 100, 93, 86, 79, 136, 129, 122, + 115, 108, 101, 94, 87, 144, 137, 130, 123, 116, 109, 102, 95, + 152, 145, 138, 131, 124, 
117, 110, 103, 160, 153, 146, 139, 132, + 125, 118, 111, 168, 161, 154, 147, 140, 133, 126, 119, 176, 169, + 162, 155, 148, 141, 134, 127, 184, 177, 170, 163, 156, 149, 142, + 135, 192, 185, 178, 171, 164, 157, 150, 143, 200, 193, 186, 179, + 172, 165, 158, 151, 208, 201, 194, 187, 180, 173, 166, 159, 216, + 209, 202, 195, 188, 181, 174, 167, 224, 217, 210, 203, 196, 189, + 182, 175, 232, 225, 218, 211, 204, 197, 190, 183, 240, 233, 226, + 219, 212, 205, 198, 191, 248, 241, 234, 227, 220, 213, 206, 199, + 249, 242, 235, 228, 221, 214, 207, 250, 243, 236, 229, 222, 215, + 251, 244, 237, 230, 223, 252, 245, 238, 231, 253, 246, 239, 254, + 247, 255, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 32, + 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, + 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, + 57, 50, 43, 36, 29, 22, 15, 72, 65, 58, 51, 44, 37, + 30, 23, 80, 73, 66, 59, 52, 45, 38, 31, 88, 81, 74, + 67, 60, 53, 46, 39, 96, 89, 82, 75, 68, 61, 54, 47, + 104, 97, 90, 83, 76, 69, 62, 55, 112, 105, 98, 91, 84, + 77, 70, 63, 120, 113, 106, 99, 92, 85, 78, 71, 128, 121, + 114, 107, 100, 93, 86, 79, 136, 129, 122, 115, 108, 101, 94, + 87, 144, 137, 130, 123, 116, 109, 102, 95, 152, 145, 138, 131, + 124, 117, 110, 103, 160, 153, 146, 139, 132, 125, 118, 111, 168, + 161, 154, 147, 140, 133, 126, 119, 176, 169, 162, 155, 148, 141, + 134, 127, 184, 177, 170, 163, 156, 149, 142, 135, 192, 185, 178, + 171, 164, 157, 150, 143, 200, 193, 186, 179, 172, 165, 158, 151, + 208, 201, 194, 187, 180, 173, 166, 159, 216, 209, 202, 195, 188, + 181, 174, 167, 224, 217, 210, 203, 196, 189, 182, 175, 232, 225, + 218, 211, 204, 197, 190, 183, 240, 233, 226, 219, 212, 205, 198, + 191, 248, 241, 234, 227, 220, 213, 206, 199, 256, 249, 242, 235, + 228, 221, 214, 207, 264, 257, 250, 243, 236, 229, 222, 215, 272, + 265, 258, 251, 244, 237, 230, 223, 280, 273, 266, 259, 252, 245, + 238, 231, 288, 281, 274, 267, 260, 253, 246, 239, 296, 289, 282, + 275, 268, 261, 254, 247, 304, 297, 290, 283, 276, 269, 
262, 255, + 312, 305, 298, 291, 284, 277, 270, 263, 320, 313, 306, 299, 292, + 285, 278, 271, 328, 321, 314, 307, 300, 293, 286, 279, 336, 329, + 322, 315, 308, 301, 294, 287, 344, 337, 330, 323, 316, 309, 302, + 295, 352, 345, 338, 331, 324, 317, 310, 303, 360, 353, 346, 339, + 332, 325, 318, 311, 368, 361, 354, 347, 340, 333, 326, 319, 376, + 369, 362, 355, 348, 341, 334, 327, 384, 377, 370, 363, 356, 349, + 342, 335, 392, 385, 378, 371, 364, 357, 350, 343, 400, 393, 386, + 379, 372, 365, 358, 351, 408, 401, 394, 387, 380, 373, 366, 359, + 416, 409, 402, 395, 388, 381, 374, 367, 424, 417, 410, 403, 396, + 389, 382, 375, 432, 425, 418, 411, 404, 397, 390, 383, 440, 433, + 426, 419, 412, 405, 398, 391, 448, 441, 434, 427, 420, 413, 406, + 399, 456, 449, 442, 435, 428, 421, 414, 407, 464, 457, 450, 443, + 436, 429, 422, 415, 472, 465, 458, 451, 444, 437, 430, 423, 480, + 473, 466, 459, 452, 445, 438, 431, 488, 481, 474, 467, 460, 453, + 446, 439, 496, 489, 482, 475, 468, 461, 454, 447, 504, 497, 490, + 483, 476, 469, 462, 455, 505, 498, 491, 484, 477, 470, 463, 506, + 499, 492, 485, 478, 471, 507, 500, 493, 486, 479, 508, 501, 494, + 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, + 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, + 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, + 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, + 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, + 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, + 47, 63, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, + 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, + 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, + 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, + 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, + 71, 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, + 118, 103, 88, 73, 58, 43, 28, 13, 119, 104, 89, 74, 59, + 44, 
29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, + 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, + 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, + 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, + 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, + 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, 54, + 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, + 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, + 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, + 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, + 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, + 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, + 90, 75, 60, 45, 30, 15, 241, 226, 211, 196, 181, 166, 151, + 136, 121, 106, 91, 76, 61, 46, 31, 242, 227, 212, 197, 182, + 167, 152, 137, 122, 107, 92, 77, 62, 47, 243, 228, 213, 198, + 183, 168, 153, 138, 123, 108, 93, 78, 63, 244, 229, 214, 199, + 184, 169, 154, 139, 124, 109, 94, 79, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 246, 231, 216, 201, 186, 171, 156, + 141, 126, 111, 247, 232, 217, 202, 187, 172, 157, 142, 127, 248, + 233, 218, 203, 188, 173, 158, 143, 249, 234, 219, 204, 189, 174, + 159, 250, 235, 220, 205, 190, 175, 251, 236, 221, 206, 191, 252, + 237, 222, 207, 253, 238, 223, 254, 239, 255, 0, 16, 1, 32, + 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, + 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, + 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, + 23, 8, 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 160, + 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, 176, 161, 146, + 131, 116, 101, 86, 71, 56, 41, 26, 11, 192, 177, 162, 147, + 132, 117, 102, 87, 72, 57, 42, 27, 12, 208, 193, 178, 163, + 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, 224, 209, 194, + 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 240, + 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, + 30, 15, 256, 241, 
226, 211, 196, 181, 166, 151, 136, 121, 106, + 91, 76, 61, 46, 31, 272, 257, 242, 227, 212, 197, 182, 167, + 152, 137, 122, 107, 92, 77, 62, 47, 288, 273, 258, 243, 228, + 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 304, 289, + 274, 259, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, + 79, 320, 305, 290, 275, 260, 245, 230, 215, 200, 185, 170, 155, + 140, 125, 110, 95, 336, 321, 306, 291, 276, 261, 246, 231, 216, + 201, 186, 171, 156, 141, 126, 111, 352, 337, 322, 307, 292, 277, + 262, 247, 232, 217, 202, 187, 172, 157, 142, 127, 368, 353, 338, + 323, 308, 293, 278, 263, 248, 233, 218, 203, 188, 173, 158, 143, + 384, 369, 354, 339, 324, 309, 294, 279, 264, 249, 234, 219, 204, + 189, 174, 159, 400, 385, 370, 355, 340, 325, 310, 295, 280, 265, + 250, 235, 220, 205, 190, 175, 416, 401, 386, 371, 356, 341, 326, + 311, 296, 281, 266, 251, 236, 221, 206, 191, 432, 417, 402, 387, + 372, 357, 342, 327, 312, 297, 282, 267, 252, 237, 222, 207, 448, + 433, 418, 403, 388, 373, 358, 343, 328, 313, 298, 283, 268, 253, + 238, 223, 464, 449, 434, 419, 404, 389, 374, 359, 344, 329, 314, + 299, 284, 269, 254, 239, 480, 465, 450, 435, 420, 405, 390, 375, + 360, 345, 330, 315, 300, 285, 270, 255, 496, 481, 466, 451, 436, + 421, 406, 391, 376, 361, 346, 331, 316, 301, 286, 271, 497, 482, + 467, 452, 437, 422, 407, 392, 377, 362, 347, 332, 317, 302, 287, + 498, 483, 468, 453, 438, 423, 408, 393, 378, 363, 348, 333, 318, + 303, 499, 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, + 319, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, 350, 335, + 501, 486, 471, 456, 441, 426, 411, 396, 381, 366, 351, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 503, 488, 473, 458, 443, + 428, 413, 398, 383, 504, 489, 474, 459, 444, 429, 414, 399, 505, + 490, 475, 460, 445, 430, 415, 506, 491, 476, 461, 446, 431, 507, + 492, 477, 462, 447, 508, 493, 478, 463, 509, 494, 479, 510, 495, + 511, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, + 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 
81, 66, 51, + 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, + 98, 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, + 54, 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, + 25, 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, + 11, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, + 28, 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, + 59, 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, + 105, 90, 75, 60, 45, 30, 15, 256, 241, 226, 211, 196, 181, + 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 272, 257, 242, + 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, + 288, 273, 258, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, + 93, 78, 63, 304, 289, 274, 259, 244, 229, 214, 199, 184, 169, + 154, 139, 124, 109, 94, 79, 320, 305, 290, 275, 260, 245, 230, + 215, 200, 185, 170, 155, 140, 125, 110, 95, 336, 321, 306, 291, + 276, 261, 246, 231, 216, 201, 186, 171, 156, 141, 126, 111, 352, + 337, 322, 307, 292, 277, 262, 247, 232, 217, 202, 187, 172, 157, + 142, 127, 368, 353, 338, 323, 308, 293, 278, 263, 248, 233, 218, + 203, 188, 173, 158, 143, 384, 369, 354, 339, 324, 309, 294, 279, + 264, 249, 234, 219, 204, 189, 174, 159, 400, 385, 370, 355, 340, + 325, 310, 295, 280, 265, 250, 235, 220, 205, 190, 175, 416, 401, + 386, 371, 356, 341, 326, 311, 296, 281, 266, 251, 236, 221, 206, + 191, 432, 417, 402, 387, 372, 357, 342, 327, 312, 297, 282, 267, + 252, 237, 222, 207, 448, 433, 418, 403, 388, 373, 358, 343, 328, + 313, 298, 283, 268, 253, 238, 223, 464, 449, 434, 419, 404, 389, + 374, 359, 344, 329, 314, 299, 284, 269, 254, 239, 480, 465, 450, + 435, 420, 405, 390, 375, 360, 345, 330, 315, 300, 285, 270, 255, + 496, 481, 466, 451, 436, 421, 406, 391, 376, 361, 346, 331, 316, + 301, 286, 271, 512, 497, 482, 467, 452, 437, 422, 407, 392, 377, + 362, 347, 332, 317, 302, 287, 528, 513, 498, 483, 468, 453, 438, + 423, 408, 393, 378, 363, 348, 333, 318, 303, 544, 529, 
514, 499, + 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, 319, 560, + 545, 530, 515, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, + 350, 335, 576, 561, 546, 531, 516, 501, 486, 471, 456, 441, 426, + 411, 396, 381, 366, 351, 592, 577, 562, 547, 532, 517, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 608, 593, 578, 563, 548, + 533, 518, 503, 488, 473, 458, 443, 428, 413, 398, 383, 624, 609, + 594, 579, 564, 549, 534, 519, 504, 489, 474, 459, 444, 429, 414, + 399, 640, 625, 610, 595, 580, 565, 550, 535, 520, 505, 490, 475, + 460, 445, 430, 415, 656, 641, 626, 611, 596, 581, 566, 551, 536, + 521, 506, 491, 476, 461, 446, 431, 672, 657, 642, 627, 612, 597, + 582, 567, 552, 537, 522, 507, 492, 477, 462, 447, 688, 673, 658, + 643, 628, 613, 598, 583, 568, 553, 538, 523, 508, 493, 478, 463, + 704, 689, 674, 659, 644, 629, 614, 599, 584, 569, 554, 539, 524, + 509, 494, 479, 720, 705, 690, 675, 660, 645, 630, 615, 600, 585, + 570, 555, 540, 525, 510, 495, 736, 721, 706, 691, 676, 661, 646, + 631, 616, 601, 586, 571, 556, 541, 526, 511, 752, 737, 722, 707, + 692, 677, 662, 647, 632, 617, 602, 587, 572, 557, 542, 527, 768, + 753, 738, 723, 708, 693, 678, 663, 648, 633, 618, 603, 588, 573, + 558, 543, 784, 769, 754, 739, 724, 709, 694, 679, 664, 649, 634, + 619, 604, 589, 574, 559, 800, 785, 770, 755, 740, 725, 710, 695, + 680, 665, 650, 635, 620, 605, 590, 575, 816, 801, 786, 771, 756, + 741, 726, 711, 696, 681, 666, 651, 636, 621, 606, 591, 832, 817, + 802, 787, 772, 757, 742, 727, 712, 697, 682, 667, 652, 637, 622, + 607, 848, 833, 818, 803, 788, 773, 758, 743, 728, 713, 698, 683, + 668, 653, 638, 623, 864, 849, 834, 819, 804, 789, 774, 759, 744, + 729, 714, 699, 684, 669, 654, 639, 880, 865, 850, 835, 820, 805, + 790, 775, 760, 745, 730, 715, 700, 685, 670, 655, 896, 881, 866, + 851, 836, 821, 806, 791, 776, 761, 746, 731, 716, 701, 686, 671, + 912, 897, 882, 867, 852, 837, 822, 807, 792, 777, 762, 747, 732, + 717, 702, 687, 928, 913, 898, 883, 868, 853, 
838, 823, 808, 793, + 778, 763, 748, 733, 718, 703, 944, 929, 914, 899, 884, 869, 854, + 839, 824, 809, 794, 779, 764, 749, 734, 719, 960, 945, 930, 915, + 900, 885, 870, 855, 840, 825, 810, 795, 780, 765, 750, 735, 976, + 961, 946, 931, 916, 901, 886, 871, 856, 841, 826, 811, 796, 781, + 766, 751, 992, 977, 962, 947, 932, 917, 902, 887, 872, 857, 842, + 827, 812, 797, 782, 767, 1008, 993, 978, 963, 948, 933, 918, 903, + 888, 873, 858, 843, 828, 813, 798, 783, 1009, 994, 979, 964, 949, + 934, 919, 904, 889, 874, 859, 844, 829, 814, 799, 1010, 995, 980, + 965, 950, 935, 920, 905, 890, 875, 860, 845, 830, 815, 1011, 996, + 981, 966, 951, 936, 921, 906, 891, 876, 861, 846, 831, 1012, 997, + 982, 967, 952, 937, 922, 907, 892, 877, 862, 847, 1013, 998, 983, + 968, 953, 938, 923, 908, 893, 878, 863, 1014, 999, 984, 969, 954, + 939, 924, 909, 894, 879, 1015, 1000, 985, 970, 955, 940, 925, 910, + 895, 1016, 1001, 986, 971, 956, 941, 926, 911, 1017, 1002, 987, 972, + 957, 942, 927, 1018, 1003, 988, 973, 958, 943, 1019, 1004, 989, 974, + 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, + 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, + 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, + 62, 31, 63, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, + 97, 66, 35, 4, 98, 67, 36, 5, 99, 68, 37, 6, 100, + 69, 38, 7, 101, 70, 39, 8, 102, 71, 40, 9, 103, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 12, 106, 75, 44, + 13, 107, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 16, + 110, 79, 48, 17, 111, 80, 49, 18, 112, 81, 50, 19, 113, + 82, 51, 20, 114, 83, 52, 21, 115, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 24, 118, 87, 56, 25, 119, 88, 57, + 26, 120, 89, 58, 27, 121, 90, 59, 28, 122, 91, 60, 29, + 123, 92, 61, 30, 124, 93, 
62, 31, 125, 94, 63, 126, 95, + 127, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, + 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, + 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, + 163, 132, 101, 70, 39, 8, 226, 195, 164, 133, 102, 71, 40, + 9, 227, 196, 165, 134, 103, 72, 41, 10, 228, 197, 166, 135, + 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, 12, 230, + 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, + 140, 109, 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, + 235, 204, 173, 142, 111, 80, 49, 18, 236, 205, 174, 143, 112, + 81, 50, 19, 237, 206, 175, 144, 113, 82, 51, 20, 238, 207, + 176, 145, 114, 83, 52, 21, 239, 208, 177, 146, 115, 84, 53, + 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, 210, 179, 148, + 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, 243, + 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, + 58, 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, + 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, + 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, + 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 0, 32, 1, + 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, + 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, + 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, + 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, + 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, 352, 321, + 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, 322, + 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, + 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, + 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, + 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, + 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 
296, + 265, 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, + 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 484, + 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, + 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, 175, + 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, + 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, + 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, 242, + 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, + 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, + 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, + 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, + 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, + 278, 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, + 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 497, + 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, + 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, + 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, + 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, + 159, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, + 471, 440, 409, 378, 347, 316, 285, 254, 223, 503, 472, 441, 410, + 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, + 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, + 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, + 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, + 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, + 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, + 
133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, + 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, + 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, + 43, 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, + 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, + 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, + 232, 201, 170, 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, + 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, + 513, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, + 110, 79, 48, 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, + 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, + 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, + 112, 81, 50, 19, 640, 609, 578, 547, 516, 485, 454, 423, 392, + 361, 330, 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, + 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, + 549, 518, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, + 146, 115, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, + 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, + 85, 54, 23, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, + 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, + 55, 24, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, + 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, + 56, 25, 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, + 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, + 88, 57, 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, + 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, + 151, 120, 89, 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, + 648, 617, 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, + 245, 214, 183, 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, + 773, 742, 711, 680, 649, 618, 587, 556, 525, 494, 
463, 432, 401, + 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, + 929, 898, 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, + 526, 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, + 123, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, + 713, 682, 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, + 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, + 900, 869, 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, + 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 995, 964, 933, 902, 871, 840, 809, + 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, + 375, 344, 313, 282, 251, 220, 189, 158, 127, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, + 935, 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, + 532, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 998, + 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, + 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 999, + 968, 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, + 565, 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1000, 969, + 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, + 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 908, + 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, + 474, 443, 412, 381, 350, 319, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, + 662, 631, 600, 569, 538, 507, 476, 445, 414, 383, 1004, 973, 942, + 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, + 508, 477, 446, 415, 1005, 974, 943, 912, 881, 
850, 819, 788, 757, + 726, 695, 664, 633, 602, 571, 540, 509, 478, 447, 1006, 975, 944, + 913, 882, 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, + 510, 479, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1008, 977, 946, 915, 884, 853, 822, + 791, 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 916, + 885, 854, 823, 792, 761, 730, 699, 668, 637, 606, 575, 1010, 979, + 948, 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1011, + 980, 949, 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 1012, + 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, + 951, 920, 889, 858, 827, 796, 765, 734, 703, 1014, 983, 952, 921, + 890, 859, 828, 797, 766, 735, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, + 924, 893, 862, 831, 1018, 987, 956, 925, 894, 863, 1019, 988, 957, + 926, 895, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, + 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, + 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, + 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, + 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, + 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, + 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, + 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, + 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, + 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, + 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, 357, 326, 295, + 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, 513, 482, 451, + 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, + 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, 235, + 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, 515, 484, 453, + 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, + 19, 640, 609, 578, 
547, 516, 485, 454, 423, 392, 361, 330, 299, + 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, 641, 610, 579, + 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, + 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, 549, 518, 487, + 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, + 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, + 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, + 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, 396, + 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 800, + 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, + 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 832, + 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, 429, + 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, + 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, 492, + 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, + 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, 586, + 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, + 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, 773, 742, 711, + 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, 308, + 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, 929, 898, 867, + 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, 464, + 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, + 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, 248, + 217, 186, 155, 124, 93, 62, 31, 1024, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, +1056, 1025, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 1088, 1057, 1026, 995, 964, 933, 902, + 871, 840, 809, 778, 747, 716, 685, 654, 
623, 592, 561, 530, 499, + 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 1120, +1089, 1058, 1027, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, + 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, 314, + 283, 252, 221, 190, 159, 1152, 1121, 1090, 1059, 1028, 997, 966, 935, + 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, + 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 1184, 1153, +1122, 1091, 1060, 1029, 998, 967, 936, 905, 874, 843, 812, 781, 750, + 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, + 316, 285, 254, 223, 1216, 1185, 1154, 1123, 1092, 1061, 1030, 999, 968, + 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, + 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1248, 1217, 1186, +1155, 1124, 1093, 1062, 1031, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, 628, 597, 566, 535, 504, 473, 442, 411, 380, + 349, 318, 287, 1280, 1249, 1218, 1187, 1156, 1125, 1094, 1063, 1032, 1001, + 970, 939, 908, 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, + 567, 536, 505, 474, 443, 412, 381, 350, 319, 1312, 1281, 1250, 1219, +1188, 1157, 1126, 1095, 1064, 1033, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1344, 1313, 1282, 1251, 1220, 1189, 1158, 1127, 1096, 1065, 1034, +1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 1376, 1345, 1314, 1283, 1252, +1221, 1190, 1159, 1128, 1097, 1066, 1035, 1004, 973, 942, 911, 880, 849, + 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, 446, + 415, 1408, 1377, 1346, 1315, 1284, 1253, 1222, 1191, 1160, 1129, 1098, 1067, +1036, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, 664, + 633, 602, 571, 540, 509, 478, 447, 1440, 1409, 1378, 1347, 1316, 1285, +1254, 1223, 1192, 1161, 1130, 1099, 1068, 1037, 1006, 975, 944, 913, 882, + 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, 
+1472, 1441, 1410, 1379, 1348, 1317, 1286, 1255, 1224, 1193, 1162, 1131, 1100, +1069, 1038, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1504, 1473, 1442, 1411, 1380, 1349, 1318, +1287, 1256, 1225, 1194, 1163, 1132, 1101, 1070, 1039, 1008, 977, 946, 915, + 884, 853, 822, 791, 760, 729, 698, 667, 636, 605, 574, 543, 1536, +1505, 1474, 1443, 1412, 1381, 1350, 1319, 1288, 1257, 1226, 1195, 1164, 1133, +1102, 1071, 1040, 1009, 978, 947, 916, 885, 854, 823, 792, 761, 730, + 699, 668, 637, 606, 575, 1568, 1537, 1506, 1475, 1444, 1413, 1382, 1351, +1320, 1289, 1258, 1227, 1196, 1165, 1134, 1103, 1072, 1041, 1010, 979, 948, + 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1600, 1569, +1538, 1507, 1476, 1445, 1414, 1383, 1352, 1321, 1290, 1259, 1228, 1197, 1166, +1135, 1104, 1073, 1042, 1011, 980, 949, 918, 887, 856, 825, 794, 763, + 732, 701, 670, 639, 1632, 1601, 1570, 1539, 1508, 1477, 1446, 1415, 1384, +1353, 1322, 1291, 1260, 1229, 1198, 1167, 1136, 1105, 1074, 1043, 1012, 981, + 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1664, 1633, 1602, +1571, 1540, 1509, 1478, 1447, 1416, 1385, 1354, 1323, 1292, 1261, 1230, 1199, +1168, 1137, 1106, 1075, 1044, 1013, 982, 951, 920, 889, 858, 827, 796, + 765, 734, 703, 1696, 1665, 1634, 1603, 1572, 1541, 1510, 1479, 1448, 1417, +1386, 1355, 1324, 1293, 1262, 1231, 1200, 1169, 1138, 1107, 1076, 1045, 1014, + 983, 952, 921, 890, 859, 828, 797, 766, 735, 1728, 1697, 1666, 1635, +1604, 1573, 1542, 1511, 1480, 1449, 1418, 1387, 1356, 1325, 1294, 1263, 1232, +1201, 1170, 1139, 1108, 1077, 1046, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1760, 1729, 1698, 1667, 1636, 1605, 1574, 1543, 1512, 1481, 1450, +1419, 1388, 1357, 1326, 1295, 1264, 1233, 1202, 1171, 1140, 1109, 1078, 1047, +1016, 985, 954, 923, 892, 861, 830, 799, 1792, 1761, 1730, 1699, 1668, +1637, 1606, 1575, 1544, 1513, 1482, 1451, 1420, 1389, 1358, 1327, 1296, 1265, +1234, 1203, 1172, 1141, 1110, 1079, 1048, 1017, 
986, 955, 924, 893, 862, + 831, 1824, 1793, 1762, 1731, 1700, 1669, 1638, 1607, 1576, 1545, 1514, 1483, +1452, 1421, 1390, 1359, 1328, 1297, 1266, 1235, 1204, 1173, 1142, 1111, 1080, +1049, 1018, 987, 956, 925, 894, 863, 1856, 1825, 1794, 1763, 1732, 1701, +1670, 1639, 1608, 1577, 1546, 1515, 1484, 1453, 1422, 1391, 1360, 1329, 1298, +1267, 1236, 1205, 1174, 1143, 1112, 1081, 1050, 1019, 988, 957, 926, 895, +1888, 1857, 1826, 1795, 1764, 1733, 1702, 1671, 1640, 1609, 1578, 1547, 1516, +1485, 1454, 1423, 1392, 1361, 1330, 1299, 1268, 1237, 1206, 1175, 1144, 1113, +1082, 1051, 1020, 989, 958, 927, 1920, 1889, 1858, 1827, 1796, 1765, 1734, +1703, 1672, 1641, 1610, 1579, 1548, 1517, 1486, 1455, 1424, 1393, 1362, 1331, +1300, 1269, 1238, 1207, 1176, 1145, 1114, 1083, 1052, 1021, 990, 959, 1952, +1921, 1890, 1859, 1828, 1797, 1766, 1735, 1704, 1673, 1642, 1611, 1580, 1549, +1518, 1487, 1456, 1425, 1394, 1363, 1332, 1301, 1270, 1239, 1208, 1177, 1146, +1115, 1084, 1053, 1022, 991, 1984, 1953, 1922, 1891, 1860, 1829, 1798, 1767, +1736, 1705, 1674, 1643, 1612, 1581, 1550, 1519, 1488, 1457, 1426, 1395, 1364, +1333, 1302, 1271, 1240, 1209, 1178, 1147, 1116, 1085, 1054, 1023, 2016, 1985, +1954, 1923, 1892, 1861, 1830, 1799, 1768, 1737, 1706, 1675, 1644, 1613, 1582, +1551, 1520, 1489, 1458, 1427, 1396, 1365, 1334, 1303, 1272, 1241, 1210, 1179, +1148, 1117, 1086, 1055, 2017, 1986, 1955, 1924, 1893, 1862, 1831, 1800, 1769, +1738, 1707, 1676, 1645, 1614, 1583, 1552, 1521, 1490, 1459, 1428, 1397, 1366, +1335, 1304, 1273, 1242, 1211, 1180, 1149, 1118, 1087, 2018, 1987, 1956, 1925, +1894, 1863, 1832, 1801, 1770, 1739, 1708, 1677, 1646, 1615, 1584, 1553, 1522, +1491, 1460, 1429, 1398, 1367, 1336, 1305, 1274, 1243, 1212, 1181, 1150, 1119, +2019, 1988, 1957, 1926, 1895, 1864, 1833, 1802, 1771, 1740, 1709, 1678, 1647, +1616, 1585, 1554, 1523, 1492, 1461, 1430, 1399, 1368, 1337, 1306, 1275, 1244, +1213, 1182, 1151, 2020, 1989, 1958, 1927, 1896, 1865, 1834, 1803, 1772, 1741, +1710, 1679, 
1648, 1617, 1586, 1555, 1524, 1493, 1462, 1431, 1400, 1369, 1338, +1307, 1276, 1245, 1214, 1183, 2021, 1990, 1959, 1928, 1897, 1866, 1835, 1804, +1773, 1742, 1711, 1680, 1649, 1618, 1587, 1556, 1525, 1494, 1463, 1432, 1401, +1370, 1339, 1308, 1277, 1246, 1215, 2022, 1991, 1960, 1929, 1898, 1867, 1836, +1805, 1774, 1743, 1712, 1681, 1650, 1619, 1588, 1557, 1526, 1495, 1464, 1433, +1402, 1371, 1340, 1309, 1278, 1247, 2023, 1992, 1961, 1930, 1899, 1868, 1837, +1806, 1775, 1744, 1713, 1682, 1651, 1620, 1589, 1558, 1527, 1496, 1465, 1434, +1403, 1372, 1341, 1310, 1279, 2024, 1993, 1962, 1931, 1900, 1869, 1838, 1807, +1776, 1745, 1714, 1683, 1652, 1621, 1590, 1559, 1528, 1497, 1466, 1435, 1404, +1373, 1342, 1311, 2025, 1994, 1963, 1932, 1901, 1870, 1839, 1808, 1777, 1746, +1715, 1684, 1653, 1622, 1591, 1560, 1529, 1498, 1467, 1436, 1405, 1374, 1343, +2026, 1995, 1964, 1933, 1902, 1871, 1840, 1809, 1778, 1747, 1716, 1685, 1654, +1623, 1592, 1561, 1530, 1499, 1468, 1437, 1406, 1375, 2027, 1996, 1965, 1934, +1903, 1872, 1841, 1810, 1779, 1748, 1717, 1686, 1655, 1624, 1593, 1562, 1531, +1500, 1469, 1438, 1407, 2028, 1997, 1966, 1935, 1904, 1873, 1842, 1811, 1780, +1749, 1718, 1687, 1656, 1625, 1594, 1563, 1532, 1501, 1470, 1439, 2029, 1998, +1967, 1936, 1905, 1874, 1843, 1812, 1781, 1750, 1719, 1688, 1657, 1626, 1595, +1564, 1533, 1502, 1471, 2030, 1999, 1968, 1937, 1906, 1875, 1844, 1813, 1782, +1751, 1720, 1689, 1658, 1627, 1596, 1565, 1534, 1503, 2031, 2000, 1969, 1938, +1907, 1876, 1845, 1814, 1783, 1752, 1721, 1690, 1659, 1628, 1597, 1566, 1535, +2032, 2001, 1970, 1939, 1908, 1877, 1846, 1815, 1784, 1753, 1722, 1691, 1660, +1629, 1598, 1567, 2033, 2002, 1971, 1940, 1909, 1878, 1847, 1816, 1785, 1754, +1723, 1692, 1661, 1630, 1599, 2034, 2003, 1972, 1941, 1910, 1879, 1848, 1817, +1786, 1755, 1724, 1693, 1662, 1631, 2035, 2004, 1973, 1942, 1911, 1880, 1849, +1818, 1787, 1756, 1725, 1694, 1663, 2036, 2005, 1974, 1943, 1912, 1881, 1850, +1819, 1788, 1757, 1726, 1695, 2037, 
2006, 1975, 1944, 1913, 1882, 1851, 1820, +1789, 1758, 1727, 2038, 2007, 1976, 1945, 1914, 1883, 1852, 1821, 1790, 1759, +2039, 2008, 1977, 1946, 1915, 1884, 1853, 1822, 1791, 2040, 2009, 1978, 1947, +1916, 1885, 1854, 1823, 2041, 2010, 1979, 1948, 1917, 1886, 1855, 2042, 2011, +1980, 1949, 1918, 1887, 2043, 2012, 1981, 1950, 1919, 2044, 2013, 1982, 1951, +2045, 2014, 1983, 2046, 2015, 2047, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, + 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, + 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, + 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, + 30, 94, 31, 95, 32, 96, 33, 97, 34, 98, 35, 99, 36, + 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, + 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, + 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, + 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, + 126, 63, 127, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, + 193, 130, 67, 4, 194, 131, 68, 5, 195, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 8, 198, 135, 72, 9, 199, 136, + 73, 10, 200, 137, 74, 11, 201, 138, 75, 12, 202, 139, 76, + 13, 203, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 16, + 206, 143, 80, 17, 207, 144, 81, 18, 208, 145, 82, 19, 209, + 146, 83, 20, 210, 147, 84, 21, 211, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 24, 214, 151, 88, 25, 215, 152, 89, + 26, 216, 153, 90, 27, 217, 154, 91, 28, 218, 155, 92, 29, + 219, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 32, 222, + 159, 96, 33, 223, 160, 97, 34, 224, 161, 98, 35, 225, 162, + 99, 36, 226, 163, 100, 37, 227, 164, 101, 38, 228, 165, 102, + 39, 229, 166, 103, 40, 230, 167, 104, 41, 231, 168, 105, 42, + 232, 169, 106, 43, 233, 170, 
107, 44, 234, 171, 108, 45, 235, + 172, 109, 46, 236, 173, 110, 47, 237, 174, 111, 48, 238, 175, + 112, 49, 239, 176, 113, 50, 240, 177, 114, 51, 241, 178, 115, + 52, 242, 179, 116, 53, 243, 180, 117, 54, 244, 181, 118, 55, + 245, 182, 119, 56, 246, 183, 120, 57, 247, 184, 121, 58, 248, + 185, 122, 59, 249, 186, 123, 60, 250, 187, 124, 61, 251, 188, + 125, 62, 252, 189, 126, 63, 253, 190, 127, 254, 191, 255, 0, + 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, 130, 67, + 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, + 6, 448, 385, 322, 259, 196, 133, 70, 7, 449, 386, 323, 260, + 197, 134, 71, 8, 450, 387, 324, 261, 198, 135, 72, 9, 451, + 388, 325, 262, 199, 136, 73, 10, 452, 389, 326, 263, 200, 137, + 74, 11, 453, 390, 327, 264, 201, 138, 75, 12, 454, 391, 328, + 265, 202, 139, 76, 13, 455, 392, 329, 266, 203, 140, 77, 14, + 456, 393, 330, 267, 204, 141, 78, 15, 457, 394, 331, 268, 205, + 142, 79, 16, 458, 395, 332, 269, 206, 143, 80, 17, 459, 396, + 333, 270, 207, 144, 81, 18, 460, 397, 334, 271, 208, 145, 82, + 19, 461, 398, 335, 272, 209, 146, 83, 20, 462, 399, 336, 273, + 210, 147, 84, 21, 463, 400, 337, 274, 211, 148, 85, 22, 464, + 401, 338, 275, 212, 149, 86, 23, 465, 402, 339, 276, 213, 150, + 87, 24, 466, 403, 340, 277, 214, 151, 88, 25, 467, 404, 341, + 278, 215, 152, 89, 26, 468, 405, 342, 279, 216, 153, 90, 27, + 469, 406, 343, 280, 217, 154, 91, 28, 470, 407, 344, 281, 218, + 155, 92, 29, 471, 408, 345, 282, 219, 156, 93, 30, 472, 409, + 346, 283, 220, 157, 94, 31, 473, 410, 347, 284, 221, 158, 95, + 32, 474, 411, 348, 285, 222, 159, 96, 33, 475, 412, 349, 286, + 223, 160, 97, 34, 476, 413, 350, 287, 224, 161, 98, 35, 477, + 414, 351, 288, 225, 162, 99, 36, 478, 415, 352, 289, 226, 163, + 100, 37, 479, 416, 353, 290, 227, 164, 101, 38, 480, 417, 354, + 291, 228, 165, 102, 39, 481, 418, 355, 292, 229, 166, 103, 40, + 482, 419, 356, 293, 230, 167, 104, 41, 483, 420, 357, 294, 231, + 168, 105, 42, 484, 421, 358, 295, 232, 169, 106, 43, 485, 
422, + 359, 296, 233, 170, 107, 44, 486, 423, 360, 297, 234, 171, 108, + 45, 487, 424, 361, 298, 235, 172, 109, 46, 488, 425, 362, 299, + 236, 173, 110, 47, 489, 426, 363, 300, 237, 174, 111, 48, 490, + 427, 364, 301, 238, 175, 112, 49, 491, 428, 365, 302, 239, 176, + 113, 50, 492, 429, 366, 303, 240, 177, 114, 51, 493, 430, 367, + 304, 241, 178, 115, 52, 494, 431, 368, 305, 242, 179, 116, 53, + 495, 432, 369, 306, 243, 180, 117, 54, 496, 433, 370, 307, 244, + 181, 118, 55, 497, 434, 371, 308, 245, 182, 119, 56, 498, 435, + 372, 309, 246, 183, 120, 57, 499, 436, 373, 310, 247, 184, 121, + 58, 500, 437, 374, 311, 248, 185, 122, 59, 501, 438, 375, 312, + 249, 186, 123, 60, 502, 439, 376, 313, 250, 187, 124, 61, 503, + 440, 377, 314, 251, 188, 125, 62, 504, 441, 378, 315, 252, 189, + 126, 63, 505, 442, 379, 316, 253, 190, 127, 506, 443, 380, 317, + 254, 191, 507, 444, 381, 318, 255, 508, 445, 382, 319, 509, 446, + 383, 510, 447, 511, 0, 64, 1, 128, 65, 2, 192, 129, 66, + 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, + 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, + 7, 512, 449, 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, + 387, 324, 261, 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, + 262, 199, 136, 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, + 200, 137, 74, 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, + 201, 138, 75, 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, + 265, 202, 139, 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, + 392, 329, 266, 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, + 582, 519, 456, 393, 330, 267, 204, 141, 78, 15, 961, 898, 835, + 772, 709, 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, + 333, 270, 207, 144, 81, 18, 964, 901, 838, 775, 712, 649, 586, + 523, 460, 397, 334, 271, 208, 145, 82, 19, 965, 902, 839, 776, + 713, 650, 587, 524, 461, 398, 335, 272, 209, 146, 
83, 20, 966, + 903, 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, + 84, 21, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, + 274, 211, 148, 85, 22, 968, 905, 842, 779, 716, 653, 590, 527, + 464, 401, 338, 275, 212, 149, 86, 23, 969, 906, 843, 780, 717, + 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, 24, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 971, 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, + 215, 152, 89, 26, 972, 909, 846, 783, 720, 657, 594, 531, 468, + 405, 342, 279, 216, 153, 90, 27, 973, 910, 847, 784, 721, 658, + 595, 532, 469, 406, 343, 280, 217, 154, 91, 28, 974, 911, 848, + 785, 722, 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, + 346, 283, 220, 157, 94, 31, 977, 914, 851, 788, 725, 662, 599, + 536, 473, 410, 347, 284, 221, 158, 95, 32, 978, 915, 852, 789, + 726, 663, 600, 537, 474, 411, 348, 285, 222, 159, 96, 33, 979, + 916, 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, + 97, 34, 980, 917, 854, 791, 728, 665, 602, 539, 476, 413, 350, + 287, 224, 161, 98, 35, 981, 918, 855, 792, 729, 666, 603, 540, + 477, 414, 351, 288, 225, 162, 99, 36, 982, 919, 856, 793, 730, + 667, 604, 541, 478, 415, 352, 289, 226, 163, 100, 37, 983, 920, + 857, 794, 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, + 38, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, 291, + 228, 165, 102, 39, 985, 922, 859, 796, 733, 670, 607, 544, 481, + 418, 355, 292, 229, 166, 103, 40, 986, 923, 860, 797, 734, 671, + 608, 545, 482, 419, 356, 293, 230, 167, 104, 41, 987, 924, 861, + 798, 735, 672, 609, 546, 483, 420, 357, 294, 231, 168, 105, 42, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, + 359, 296, 233, 170, 107, 44, 990, 927, 864, 801, 738, 675, 612, + 549, 486, 423, 
360, 297, 234, 171, 108, 45, 991, 928, 865, 802, + 739, 676, 613, 550, 487, 424, 361, 298, 235, 172, 109, 46, 992, + 929, 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, + 110, 47, 993, 930, 867, 804, 741, 678, 615, 552, 489, 426, 363, + 300, 237, 174, 111, 48, 994, 931, 868, 805, 742, 679, 616, 553, + 490, 427, 364, 301, 238, 175, 112, 49, 995, 932, 869, 806, 743, + 680, 617, 554, 491, 428, 365, 302, 239, 176, 113, 50, 996, 933, + 870, 807, 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, + 51, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, 304, + 241, 178, 115, 52, 998, 935, 872, 809, 746, 683, 620, 557, 494, + 431, 368, 305, 242, 179, 116, 53, 999, 936, 873, 810, 747, 684, + 621, 558, 495, 432, 369, 306, 243, 180, 117, 54, 1000, 937, 874, + 811, 748, 685, 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, + 372, 309, 246, 183, 120, 57, 1003, 940, 877, 814, 751, 688, 625, + 562, 499, 436, 373, 310, 247, 184, 121, 58, 1004, 941, 878, 815, + 752, 689, 626, 563, 500, 437, 374, 311, 248, 185, 122, 59, 1005, + 942, 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, + 123, 60, 1006, 943, 880, 817, 754, 691, 628, 565, 502, 439, 376, + 313, 250, 187, 124, 61, 1007, 944, 881, 818, 755, 692, 629, 566, + 503, 440, 377, 314, 251, 188, 125, 62, 1008, 945, 882, 819, 756, + 693, 630, 567, 504, 441, 378, 315, 252, 189, 126, 63, 1009, 946, + 883, 820, 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, +1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, 254, + 191, 1011, 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, + 255, 1012, 949, 886, 823, 760, 697, 634, 571, 508, 445, 382, 319, +1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, 383, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 1015, 952, 889, 826, 763, + 700, 637, 574, 511, 1016, 953, 890, 827, 764, 701, 638, 575, 1017, + 954, 
891, 828, 765, 702, 639, 1018, 955, 892, 829, 766, 703, 1019, + 956, 893, 830, 767, 1020, 957, 894, 831, 1021, 958, 895, 1022, 959, +1023, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, + 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, + 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, + 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, 387, 324, 261, + 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, 262, 199, 136, + 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, 200, 137, 74, + 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, 201, 138, 75, + 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, 265, 202, 139, + 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, 392, 329, 266, + 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, 582, 519, 456, + 393, 330, 267, 204, 141, 78, 15, 1024, 961, 898, 835, 772, 709, + 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, 1088, 1025, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 1152, 1089, 1026, 963, 900, 837, 774, 711, 648, 585, + 522, 459, 396, 333, 270, 207, 144, 81, 18, 1216, 1153, 1090, 1027, + 964, 901, 838, 775, 712, 649, 586, 523, 460, 397, 334, 271, 208, + 145, 82, 19, 1280, 1217, 1154, 1091, 1028, 965, 902, 839, 776, 713, + 650, 587, 524, 461, 398, 335, 272, 209, 146, 83, 20, 1344, 1281, +1218, 1155, 1092, 1029, 966, 903, 840, 777, 714, 651, 588, 525, 462, + 399, 336, 273, 210, 147, 84, 21, 1408, 1345, 1282, 1219, 1156, 1093, +1030, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, 274, + 211, 148, 85, 22, 1472, 1409, 1346, 1283, 1220, 1157, 1094, 1031, 968, + 905, 842, 779, 716, 653, 590, 527, 464, 401, 338, 275, 212, 149, + 86, 23, 1536, 1473, 1410, 1347, 1284, 1221, 1158, 1095, 1032, 969, 906, + 843, 780, 717, 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, + 24, 1600, 1537, 1474, 1411, 1348, 1285, 1222, 1159, 1096, 1033, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 1664, 1601, 1538, 1475, 1412, 1349, 1286, 1223, 1160, 1097, 
1034, 971, + 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, 215, 152, + 89, 26, 1728, 1665, 1602, 1539, 1476, 1413, 1350, 1287, 1224, 1161, 1098, +1035, 972, 909, 846, 783, 720, 657, 594, 531, 468, 405, 342, 279, + 216, 153, 90, 27, 1792, 1729, 1666, 1603, 1540, 1477, 1414, 1351, 1288, +1225, 1162, 1099, 1036, 973, 910, 847, 784, 721, 658, 595, 532, 469, + 406, 343, 280, 217, 154, 91, 28, 1856, 1793, 1730, 1667, 1604, 1541, +1478, 1415, 1352, 1289, 1226, 1163, 1100, 1037, 974, 911, 848, 785, 722, + 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, 1920, 1857, +1794, 1731, 1668, 1605, 1542, 1479, 1416, 1353, 1290, 1227, 1164, 1101, 1038, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 1984, 1921, 1858, 1795, 1732, 1669, 1606, 1543, 1480, 1417, +1354, 1291, 1228, 1165, 1102, 1039, 976, 913, 850, 787, 724, 661, 598, + 535, 472, 409, 346, 283, 220, 157, 94, 31, 1985, 1922, 1859, 1796, +1733, 1670, 1607, 1544, 1481, 1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, + 914, 851, 788, 725, 662, 599, 536, 473, 410, 347, 284, 221, 158, + 95, 32, 1986, 1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, +1293, 1230, 1167, 1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, + 474, 411, 348, 285, 222, 159, 96, 33, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, 1484, 1421, 1358, 1295, +1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, 665, 602, 539, 476, + 413, 350, 287, 224, 161, 98, 35, 1989, 1926, 1863, 1800, 1737, 1674, +1611, 1548, 1485, 1422, 1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, + 792, 729, 666, 603, 540, 477, 414, 351, 288, 225, 162, 99, 36, +1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, 1297, 1234, +1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, 478, 415, + 352, 289, 226, 163, 100, 37, 1991, 1928, 
1865, 1802, 1739, 1676, 1613, +1550, 1487, 1424, 1361, 1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, + 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, 38, 1992, +1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, 1362, 1299, 1236, 1173, +1110, 1047, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, + 291, 228, 165, 102, 39, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 1994, 1931, +1868, 1805, 1742, 1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, +1049, 986, 923, 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, + 230, 167, 104, 41, 1995, 1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, +1428, 1365, 1302, 1239, 1176, 1113, 1050, 987, 924, 861, 798, 735, 672, + 609, 546, 483, 420, 357, 294, 231, 168, 105, 42, 1996, 1933, 1870, +1807, 1744, 1681, 1618, 1555, 1492, 1429, 1366, 1303, 1240, 1177, 1114, 1051, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 1997, 1934, 1871, 1808, 1745, 1682, 1619, 1556, 1493, 1430, +1367, 1304, 1241, 1178, 1115, 1052, 989, 926, 863, 800, 737, 674, 611, + 548, 485, 422, 359, 296, 233, 170, 107, 44, 1998, 1935, 1872, 1809, +1746, 1683, 1620, 1557, 1494, 1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, + 927, 864, 801, 738, 675, 612, 549, 486, 423, 360, 297, 234, 171, + 108, 45, 1999, 1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, +1306, 1243, 1180, 1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, + 487, 424, 361, 298, 235, 172, 109, 46, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, 110, + 47, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, 1497, 1434, 1371, 1308, +1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, 678, 615, 552, 489, + 426, 363, 300, 237, 174, 111, 48, 2002, 1939, 1876, 1813, 1750, 1687, +1624, 
1561, 1498, 1435, 1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, + 805, 742, 679, 616, 553, 490, 427, 364, 301, 238, 175, 112, 49, +2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, 1310, 1247, +1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, 491, 428, + 365, 302, 239, 176, 113, 50, 2004, 1941, 1878, 1815, 1752, 1689, 1626, +1563, 1500, 1437, 1374, 1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, + 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, 51, 2005, +1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, 1375, 1312, 1249, 1186, +1123, 1060, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, + 304, 241, 178, 115, 52, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 2007, 1944, +1881, 1818, 1755, 1692, 1629, 1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, +1062, 999, 936, 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, + 243, 180, 117, 54, 2008, 1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, +1441, 1378, 1315, 1252, 1189, 1126, 1063, 1000, 937, 874, 811, 748, 685, + 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, 2009, 1946, 1883, +1820, 1757, 1694, 1631, 1568, 1505, 1442, 1379, 1316, 1253, 1190, 1127, 1064, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 2010, 1947, 1884, 1821, 1758, 1695, 1632, 1569, 1506, 1443, +1380, 1317, 1254, 1191, 1128, 1065, 1002, 939, 876, 813, 750, 687, 624, + 561, 498, 435, 372, 309, 246, 183, 120, 57, 2011, 1948, 1885, 1822, +1759, 1696, 1633, 1570, 1507, 1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, + 940, 877, 814, 751, 688, 625, 562, 499, 436, 373, 310, 247, 184, + 121, 58, 2012, 1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, +1319, 1256, 1193, 1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, + 500, 437, 374, 311, 248, 185, 122, 59, 2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 
1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, 1510, 1447, 1384, 1321, +1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, 691, 628, 565, 502, + 439, 376, 313, 250, 187, 124, 61, 2015, 1952, 1889, 1826, 1763, 1700, +1637, 1574, 1511, 1448, 1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, + 818, 755, 692, 629, 566, 503, 440, 377, 314, 251, 188, 125, 62, +2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, 1323, 1260, +1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, 504, 441, + 378, 315, 252, 189, 126, 63, 2017, 1954, 1891, 1828, 1765, 1702, 1639, +1576, 1513, 1450, 1387, 1324, 1261, 1198, 1135, 1072, 1009, 946, 883, 820, + 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, 2018, 1955, +1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, 1136, +1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, + 254, 191, 2019, 1956, 1893, 1830, 1767, 1704, 1641, 1578, 1515, 1452, 1389, +1326, 1263, 1200, 1137, 1074, 1011, 948, 885, 822, 759, 696, 633, 570, + 507, 444, 381, 318, 255, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 2021, 1958, 1895, 1832, 1769, 1706, +1643, 1580, 1517, 1454, 1391, 1328, 1265, 1202, 1139, 1076, 1013, 950, 887, + 824, 761, 698, 635, 572, 509, 446, 383, 2022, 1959, 1896, 1833, 1770, +1707, 1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 2023, 1960, 1897, 1834, 1771, +1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, 1204, 1141, 1078, 1015, 952, + 889, 826, 763, 700, 637, 574, 511, 2024, 1961, 1898, 1835, 1772, 1709, +1646, 1583, 1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, + 827, 764, 701, 638, 575, 2025, 1962, 1899, 1836, 1773, 1710, 1647, 1584, +1521, 1458, 1395, 1332, 1269, 1206, 1143, 
1080, 1017, 954, 891, 828, 765, + 702, 639, 2026, 1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, +1333, 1270, 1207, 1144, 1081, 1018, 955, 892, 829, 766, 703, 2027, 1964, +1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, 1271, 1208, 1145, +1082, 1019, 956, 893, 830, 767, 2028, 1965, 1902, 1839, 1776, 1713, 1650, +1587, 1524, 1461, 1398, 1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, +2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, 1336, 1273, +1210, 1147, 1084, 1021, 958, 895, 2030, 1967, 1904, 1841, 1778, 1715, 1652, +1589, 1526, 1463, 1400, 1337, 1274, 1211, 1148, 1085, 1022, 959, 2031, 1968, +1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, 1149, +1086, 1023, 2032, 1969, 1906, 1843, 1780, 1717, 1654, 1591, 1528, 1465, 1402, +1339, 1276, 1213, 1150, 1087, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 2034, 1971, 1908, 1845, 1782, 1719, +1656, 1593, 1530, 1467, 1404, 1341, 1278, 1215, 2035, 1972, 1909, 1846, 1783, +1720, 1657, 1594, 1531, 1468, 1405, 1342, 1279, 2036, 1973, 1910, 1847, 1784, +1721, 1658, 1595, 1532, 1469, 1406, 1343, 2037, 1974, 1911, 1848, 1785, 1722, +1659, 1596, 1533, 1470, 1407, 2038, 1975, 1912, 1849, 1786, 1723, 1660, 1597, +1534, 1471, 2039, 1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 2040, 1977, +1914, 1851, 1788, 1725, 1662, 1599, 2041, 1978, 1915, 1852, 1789, 1726, 1663, +2042, 1979, 1916, 1853, 1790, 1727, 2043, 1980, 1917, 1854, 1791, 2044, 1981, +1918, 1855, 2045, 1982, 1919, 2046, 1983, 2047, 0, 64, 1, 128, 65, + 2, 192, 129, 66, 3, 256, 193, 130, 67, 4, 320, 257, 194, + 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, + 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, + 8, 576, 513, 450, 387, 324, 261, 198, 135, 72, 9, 640, 577, + 514, 451, 388, 325, 262, 199, 136, 73, 10, 704, 641, 578, 515, + 452, 389, 326, 263, 200, 137, 74, 11, 768, 705, 642, 579, 516, + 453, 390, 327, 264, 201, 138, 75, 12, 
832, 769, 706, 643, 580, + 517, 454, 391, 328, 265, 202, 139, 76, 13, 896, 833, 770, 707, + 644, 581, 518, 455, 392, 329, 266, 203, 140, 77, 14, 960, 897, + 834, 771, 708, 645, 582, 519, 456, 393, 330, 267, 204, 141, 78, + 15, 1024, 961, 898, 835, 772, 709, 646, 583, 520, 457, 394, 331, + 268, 205, 142, 79, 16, 1088, 1025, 962, 899, 836, 773, 710, 647, + 584, 521, 458, 395, 332, 269, 206, 143, 80, 17, 1152, 1089, 1026, + 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, 333, 270, 207, + 144, 81, 18, 1216, 1153, 1090, 1027, 964, 901, 838, 775, 712, 649, + 586, 523, 460, 397, 334, 271, 208, 145, 82, 19, 1280, 1217, 1154, +1091, 1028, 965, 902, 839, 776, 713, 650, 587, 524, 461, 398, 335, + 272, 209, 146, 83, 20, 1344, 1281, 1218, 1155, 1092, 1029, 966, 903, + 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, 84, + 21, 1408, 1345, 1282, 1219, 1156, 1093, 1030, 967, 904, 841, 778, 715, + 652, 589, 526, 463, 400, 337, 274, 211, 148, 85, 22, 1472, 1409, +1346, 1283, 1220, 1157, 1094, 1031, 968, 905, 842, 779, 716, 653, 590, + 527, 464, 401, 338, 275, 212, 149, 86, 23, 1536, 1473, 1410, 1347, +1284, 1221, 1158, 1095, 1032, 969, 906, 843, 780, 717, 654, 591, 528, + 465, 402, 339, 276, 213, 150, 87, 24, 1600, 1537, 1474, 1411, 1348, +1285, 1222, 1159, 1096, 1033, 970, 907, 844, 781, 718, 655, 592, 529, + 466, 403, 340, 277, 214, 151, 88, 25, 1664, 1601, 1538, 1475, 1412, +1349, 1286, 1223, 1160, 1097, 1034, 971, 908, 845, 782, 719, 656, 593, + 530, 467, 404, 341, 278, 215, 152, 89, 26, 1728, 1665, 1602, 1539, +1476, 1413, 1350, 1287, 1224, 1161, 1098, 1035, 972, 909, 846, 783, 720, + 657, 594, 531, 468, 405, 342, 279, 216, 153, 90, 27, 1792, 1729, +1666, 1603, 1540, 1477, 1414, 1351, 1288, 1225, 1162, 1099, 1036, 973, 910, + 847, 784, 721, 658, 595, 532, 469, 406, 343, 280, 217, 154, 91, + 28, 1856, 1793, 1730, 1667, 1604, 1541, 1478, 1415, 1352, 1289, 1226, 1163, +1100, 1037, 974, 911, 848, 785, 722, 659, 596, 533, 470, 407, 344, + 281, 218, 155, 92, 29, 1920, 
1857, 1794, 1731, 1668, 1605, 1542, 1479, +1416, 1353, 1290, 1227, 1164, 1101, 1038, 975, 912, 849, 786, 723, 660, + 597, 534, 471, 408, 345, 282, 219, 156, 93, 30, 1984, 1921, 1858, +1795, 1732, 1669, 1606, 1543, 1480, 1417, 1354, 1291, 1228, 1165, 1102, 1039, + 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, 346, 283, 220, + 157, 94, 31, 2048, 1985, 1922, 1859, 1796, 1733, 1670, 1607, 1544, 1481, +1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, 914, 851, 788, 725, 662, + 599, 536, 473, 410, 347, 284, 221, 158, 95, 32, 2112, 2049, 1986, +1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, 1293, 1230, 1167, +1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, 474, 411, 348, + 285, 222, 159, 96, 33, 2176, 2113, 2050, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 2240, 2177, 2114, 2051, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, +1484, 1421, 1358, 1295, 1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, + 665, 602, 539, 476, 413, 350, 287, 224, 161, 98, 35, 2304, 2241, +2178, 2115, 2052, 1989, 1926, 1863, 1800, 1737, 1674, 1611, 1548, 1485, 1422, +1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, 792, 729, 666, 603, + 540, 477, 414, 351, 288, 225, 162, 99, 36, 2368, 2305, 2242, 2179, +2116, 2053, 1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, +1297, 1234, 1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, + 478, 415, 352, 289, 226, 163, 100, 37, 2432, 2369, 2306, 2243, 2180, +2117, 2054, 1991, 1928, 1865, 1802, 1739, 1676, 1613, 1550, 1487, 1424, 1361, +1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, 731, 668, 605, 542, + 479, 416, 353, 290, 227, 164, 101, 38, 2496, 2433, 2370, 2307, 2244, +2181, 2118, 2055, 1992, 1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, +1362, 1299, 1236, 1173, 1110, 1047, 984, 921, 858, 795, 732, 669, 606, + 543, 480, 417, 354, 291, 228, 165, 102, 39, 2560, 2497, 
2434, 2371, +2308, 2245, 2182, 2119, 2056, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 2624, 2561, +2498, 2435, 2372, 2309, 2246, 2183, 2120, 2057, 1994, 1931, 1868, 1805, 1742, +1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, 1049, 986, 923, + 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, 230, 167, 104, + 41, 2688, 2625, 2562, 2499, 2436, 2373, 2310, 2247, 2184, 2121, 2058, 1995, +1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, 1428, 1365, 1302, 1239, 1176, +1113, 1050, 987, 924, 861, 798, 735, 672, 609, 546, 483, 420, 357, + 294, 231, 168, 105, 42, 2752, 2689, 2626, 2563, 2500, 2437, 2374, 2311, +2248, 2185, 2122, 2059, 1996, 1933, 1870, 1807, 1744, 1681, 1618, 1555, 1492, +1429, 1366, 1303, 1240, 1177, 1114, 1051, 988, 925, 862, 799, 736, 673, + 610, 547, 484, 421, 358, 295, 232, 169, 106, 43, 2816, 2753, 2690, +2627, 2564, 2501, 2438, 2375, 2312, 2249, 2186, 2123, 2060, 1997, 1934, 1871, +1808, 1745, 1682, 1619, 1556, 1493, 1430, 1367, 1304, 1241, 1178, 1115, 1052, + 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, 359, 296, 233, + 170, 107, 44, 2880, 2817, 2754, 2691, 2628, 2565, 2502, 2439, 2376, 2313, +2250, 2187, 2124, 2061, 1998, 1935, 1872, 1809, 1746, 1683, 1620, 1557, 1494, +1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, 927, 864, 801, 738, 675, + 612, 549, 486, 423, 360, 297, 234, 171, 108, 45, 2944, 2881, 2818, +2755, 2692, 2629, 2566, 2503, 2440, 2377, 2314, 2251, 2188, 2125, 2062, 1999, +1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, 1306, 1243, 1180, +1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, 487, 424, 361, + 298, 235, 172, 109, 46, 3008, 2945, 2882, 2819, 2756, 2693, 2630, 2567, +2504, 2441, 2378, 2315, 2252, 2189, 2126, 2063, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 
362, 299, 236, 173, 110, + 47, 3072, 3009, 2946, 2883, 2820, 2757, 2694, 2631, 2568, 2505, 2442, 2379, +2316, 2253, 2190, 2127, 2064, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, +1497, 1434, 1371, 1308, 1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, + 678, 615, 552, 489, 426, 363, 300, 237, 174, 111, 48, 3136, 3073, +3010, 2947, 2884, 2821, 2758, 2695, 2632, 2569, 2506, 2443, 2380, 2317, 2254, +2191, 2128, 2065, 2002, 1939, 1876, 1813, 1750, 1687, 1624, 1561, 1498, 1435, +1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, 805, 742, 679, 616, + 553, 490, 427, 364, 301, 238, 175, 112, 49, 3200, 3137, 3074, 3011, +2948, 2885, 2822, 2759, 2696, 2633, 2570, 2507, 2444, 2381, 2318, 2255, 2192, +2129, 2066, 2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, +1310, 1247, 1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, + 491, 428, 365, 302, 239, 176, 113, 50, 3264, 3201, 3138, 3075, 3012, +2949, 2886, 2823, 2760, 2697, 2634, 2571, 2508, 2445, 2382, 2319, 2256, 2193, +2130, 2067, 2004, 1941, 1878, 1815, 1752, 1689, 1626, 1563, 1500, 1437, 1374, +1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, 744, 681, 618, 555, + 492, 429, 366, 303, 240, 177, 114, 51, 3328, 3265, 3202, 3139, 3076, +3013, 2950, 2887, 2824, 2761, 2698, 2635, 2572, 2509, 2446, 2383, 2320, 2257, +2194, 2131, 2068, 2005, 1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, +1375, 1312, 1249, 1186, 1123, 1060, 997, 934, 871, 808, 745, 682, 619, + 556, 493, 430, 367, 304, 241, 178, 115, 52, 3392, 3329, 3266, 3203, +3140, 3077, 3014, 2951, 2888, 2825, 2762, 2699, 2636, 2573, 2510, 2447, 2384, +2321, 2258, 2195, 2132, 2069, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 3456, 3393, +3330, 3267, 3204, 3141, 3078, 3015, 2952, 2889, 2826, 2763, 2700, 2637, 2574, +2511, 2448, 2385, 2322, 2259, 2196, 2133, 2070, 2007, 1944, 1881, 1818, 1755, +1692, 1629, 
1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, 1062, 999, 936, + 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, 243, 180, 117, + 54, 3520, 3457, 3394, 3331, 3268, 3205, 3142, 3079, 3016, 2953, 2890, 2827, +2764, 2701, 2638, 2575, 2512, 2449, 2386, 2323, 2260, 2197, 2134, 2071, 2008, +1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, 1441, 1378, 1315, 1252, 1189, +1126, 1063, 1000, 937, 874, 811, 748, 685, 622, 559, 496, 433, 370, + 307, 244, 181, 118, 55, 3584, 3521, 3458, 3395, 3332, 3269, 3206, 3143, +3080, 3017, 2954, 2891, 2828, 2765, 2702, 2639, 2576, 2513, 2450, 2387, 2324, +2261, 2198, 2135, 2072, 2009, 1946, 1883, 1820, 1757, 1694, 1631, 1568, 1505, +1442, 1379, 1316, 1253, 1190, 1127, 1064, 1001, 938, 875, 812, 749, 686, + 623, 560, 497, 434, 371, 308, 245, 182, 119, 56, 3648, 3585, 3522, +3459, 3396, 3333, 3270, 3207, 3144, 3081, 3018, 2955, 2892, 2829, 2766, 2703, +2640, 2577, 2514, 2451, 2388, 2325, 2262, 2199, 2136, 2073, 2010, 1947, 1884, +1821, 1758, 1695, 1632, 1569, 1506, 1443, 1380, 1317, 1254, 1191, 1128, 1065, +1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, 372, 309, 246, + 183, 120, 57, 3712, 3649, 3586, 3523, 3460, 3397, 3334, 3271, 3208, 3145, +3082, 3019, 2956, 2893, 2830, 2767, 2704, 2641, 2578, 2515, 2452, 2389, 2326, +2263, 2200, 2137, 2074, 2011, 1948, 1885, 1822, 1759, 1696, 1633, 1570, 1507, +1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, 940, 877, 814, 751, 688, + 625, 562, 499, 436, 373, 310, 247, 184, 121, 58, 3776, 3713, 3650, +3587, 3524, 3461, 3398, 3335, 3272, 3209, 3146, 3083, 3020, 2957, 2894, 2831, +2768, 2705, 2642, 2579, 2516, 2453, 2390, 2327, 2264, 2201, 2138, 2075, 2012, +1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, 1319, 1256, 1193, +1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, 500, 437, 374, + 311, 248, 185, 122, 59, 3840, 3777, 3714, 3651, 3588, 3525, 3462, 3399, +3336, 3273, 3210, 3147, 3084, 3021, 2958, 2895, 2832, 2769, 2706, 2643, 2580, +2517, 2454, 2391, 2328, 2265, 2202, 2139, 2076, 
2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 3904, 3841, 3778, 3715, 3652, 3589, 3526, 3463, 3400, 3337, 3274, 3211, +3148, 3085, 3022, 2959, 2896, 2833, 2770, 2707, 2644, 2581, 2518, 2455, 2392, +2329, 2266, 2203, 2140, 2077, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, +1510, 1447, 1384, 1321, 1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, + 691, 628, 565, 502, 439, 376, 313, 250, 187, 124, 61, 3968, 3905, +3842, 3779, 3716, 3653, 3590, 3527, 3464, 3401, 3338, 3275, 3212, 3149, 3086, +3023, 2960, 2897, 2834, 2771, 2708, 2645, 2582, 2519, 2456, 2393, 2330, 2267, +2204, 2141, 2078, 2015, 1952, 1889, 1826, 1763, 1700, 1637, 1574, 1511, 1448, +1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, 818, 755, 692, 629, + 566, 503, 440, 377, 314, 251, 188, 125, 62, 4032, 3969, 3906, 3843, +3780, 3717, 3654, 3591, 3528, 3465, 3402, 3339, 3276, 3213, 3150, 3087, 3024, +2961, 2898, 2835, 2772, 2709, 2646, 2583, 2520, 2457, 2394, 2331, 2268, 2205, +2142, 2079, 2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, +1323, 1260, 1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, + 504, 441, 378, 315, 252, 189, 126, 63, 4033, 3970, 3907, 3844, 3781, +3718, 3655, 3592, 3529, 3466, 3403, 3340, 3277, 3214, 3151, 3088, 3025, 2962, +2899, 2836, 2773, 2710, 2647, 2584, 2521, 2458, 2395, 2332, 2269, 2206, 2143, +2080, 2017, 1954, 1891, 1828, 1765, 1702, 1639, 1576, 1513, 1450, 1387, 1324, +1261, 1198, 1135, 1072, 1009, 946, 883, 820, 757, 694, 631, 568, 505, + 442, 379, 316, 253, 190, 127, 4034, 3971, 3908, 3845, 3782, 3719, 3656, +3593, 3530, 3467, 3404, 3341, 3278, 3215, 3152, 3089, 3026, 2963, 2900, 2837, +2774, 2711, 2648, 2585, 2522, 2459, 2396, 2333, 2270, 2207, 2144, 2081, 2018, +1955, 1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, +1136, 1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 
380, + 317, 254, 191, 4035, 3972, 3909, 3846, 3783, 3720, 3657, 3594, 3531, 3468, +3405, 3342, 3279, 3216, 3153, 3090, 3027, 2964, 2901, 2838, 2775, 2712, 2649, +2586, 2523, 2460, 2397, 2334, 2271, 2208, 2145, 2082, 2019, 1956, 1893, 1830, +1767, 1704, 1641, 1578, 1515, 1452, 1389, 1326, 1263, 1200, 1137, 1074, 1011, + 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, 255, 4036, +3973, 3910, 3847, 3784, 3721, 3658, 3595, 3532, 3469, 3406, 3343, 3280, 3217, +3154, 3091, 3028, 2965, 2902, 2839, 2776, 2713, 2650, 2587, 2524, 2461, 2398, +2335, 2272, 2209, 2146, 2083, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 4037, 3974, 3911, 3848, 3785, 3722, +3659, 3596, 3533, 3470, 3407, 3344, 3281, 3218, 3155, 3092, 3029, 2966, 2903, +2840, 2777, 2714, 2651, 2588, 2525, 2462, 2399, 2336, 2273, 2210, 2147, 2084, +2021, 1958, 1895, 1832, 1769, 1706, 1643, 1580, 1517, 1454, 1391, 1328, 1265, +1202, 1139, 1076, 1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, + 383, 4038, 3975, 3912, 3849, 3786, 3723, 3660, 3597, 3534, 3471, 3408, 3345, +3282, 3219, 3156, 3093, 3030, 2967, 2904, 2841, 2778, 2715, 2652, 2589, 2526, +2463, 2400, 2337, 2274, 2211, 2148, 2085, 2022, 1959, 1896, 1833, 1770, 1707, +1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, 888, + 825, 762, 699, 636, 573, 510, 447, 4039, 3976, 3913, 3850, 3787, 3724, +3661, 3598, 3535, 3472, 3409, 3346, 3283, 3220, 3157, 3094, 3031, 2968, 2905, +2842, 2779, 2716, 2653, 2590, 2527, 2464, 2401, 2338, 2275, 2212, 2149, 2086, +2023, 1960, 1897, 1834, 1771, 1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, +1204, 1141, 1078, 1015, 952, 889, 826, 763, 700, 637, 574, 511, 4040, +3977, 3914, 3851, 3788, 3725, 3662, 3599, 3536, 3473, 3410, 3347, 3284, 3221, +3158, 3095, 3032, 2969, 2906, 2843, 2780, 2717, 2654, 2591, 2528, 2465, 2402, +2339, 2276, 2213, 2150, 2087, 2024, 1961, 1898, 1835, 1772, 1709, 
1646, 1583, +1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, 827, 764, + 701, 638, 575, 4041, 3978, 3915, 3852, 3789, 3726, 3663, 3600, 3537, 3474, +3411, 3348, 3285, 3222, 3159, 3096, 3033, 2970, 2907, 2844, 2781, 2718, 2655, +2592, 2529, 2466, 2403, 2340, 2277, 2214, 2151, 2088, 2025, 1962, 1899, 1836, +1773, 1710, 1647, 1584, 1521, 1458, 1395, 1332, 1269, 1206, 1143, 1080, 1017, + 954, 891, 828, 765, 702, 639, 4042, 3979, 3916, 3853, 3790, 3727, 3664, +3601, 3538, 3475, 3412, 3349, 3286, 3223, 3160, 3097, 3034, 2971, 2908, 2845, +2782, 2719, 2656, 2593, 2530, 2467, 2404, 2341, 2278, 2215, 2152, 2089, 2026, +1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, 1333, 1270, 1207, +1144, 1081, 1018, 955, 892, 829, 766, 703, 4043, 3980, 3917, 3854, 3791, +3728, 3665, 3602, 3539, 3476, 3413, 3350, 3287, 3224, 3161, 3098, 3035, 2972, +2909, 2846, 2783, 2720, 2657, 2594, 2531, 2468, 2405, 2342, 2279, 2216, 2153, +2090, 2027, 1964, 1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, +1271, 1208, 1145, 1082, 1019, 956, 893, 830, 767, 4044, 3981, 3918, 3855, +3792, 3729, 3666, 3603, 3540, 3477, 3414, 3351, 3288, 3225, 3162, 3099, 3036, +2973, 2910, 2847, 2784, 2721, 2658, 2595, 2532, 2469, 2406, 2343, 2280, 2217, +2154, 2091, 2028, 1965, 1902, 1839, 1776, 1713, 1650, 1587, 1524, 1461, 1398, +1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, 4045, 3982, 3919, 3856, +3793, 3730, 3667, 3604, 3541, 3478, 3415, 3352, 3289, 3226, 3163, 3100, 3037, +2974, 2911, 2848, 2785, 2722, 2659, 2596, 2533, 2470, 2407, 2344, 2281, 2218, +2155, 2092, 2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, +1336, 1273, 1210, 1147, 1084, 1021, 958, 895, 4046, 3983, 3920, 3857, 3794, +3731, 3668, 3605, 3542, 3479, 3416, 3353, 3290, 3227, 3164, 3101, 3038, 2975, +2912, 2849, 2786, 2723, 2660, 2597, 2534, 2471, 2408, 2345, 2282, 2219, 2156, +2093, 2030, 1967, 1904, 1841, 1778, 1715, 1652, 1589, 1526, 1463, 1400, 1337, +1274, 1211, 1148, 1085, 1022, 959, 
4047, 3984, 3921, 3858, 3795, 3732, 3669, +3606, 3543, 3480, 3417, 3354, 3291, 3228, 3165, 3102, 3039, 2976, 2913, 2850, +2787, 2724, 2661, 2598, 2535, 2472, 2409, 2346, 2283, 2220, 2157, 2094, 2031, +1968, 1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, +1149, 1086, 1023, 4048, 3985, 3922, 3859, 3796, 3733, 3670, 3607, 3544, 3481, +3418, 3355, 3292, 3229, 3166, 3103, 3040, 2977, 2914, 2851, 2788, 2725, 2662, +2599, 2536, 2473, 2410, 2347, 2284, 2221, 2158, 2095, 2032, 1969, 1906, 1843, +1780, 1717, 1654, 1591, 1528, 1465, 1402, 1339, 1276, 1213, 1150, 1087, 4049, +3986, 3923, 3860, 3797, 3734, 3671, 3608, 3545, 3482, 3419, 3356, 3293, 3230, +3167, 3104, 3041, 2978, 2915, 2852, 2789, 2726, 2663, 2600, 2537, 2474, 2411, +2348, 2285, 2222, 2159, 2096, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 4050, 3987, 3924, 3861, 3798, 3735, +3672, 3609, 3546, 3483, 3420, 3357, 3294, 3231, 3168, 3105, 3042, 2979, 2916, +2853, 2790, 2727, 2664, 2601, 2538, 2475, 2412, 2349, 2286, 2223, 2160, 2097, +2034, 1971, 1908, 1845, 1782, 1719, 1656, 1593, 1530, 1467, 1404, 1341, 1278, +1215, 4051, 3988, 3925, 3862, 3799, 3736, 3673, 3610, 3547, 3484, 3421, 3358, +3295, 3232, 3169, 3106, 3043, 2980, 2917, 2854, 2791, 2728, 2665, 2602, 2539, +2476, 2413, 2350, 2287, 2224, 2161, 2098, 2035, 1972, 1909, 1846, 1783, 1720, +1657, 1594, 1531, 1468, 1405, 1342, 1279, 4052, 3989, 3926, 3863, 3800, 3737, +3674, 3611, 3548, 3485, 3422, 3359, 3296, 3233, 3170, 3107, 3044, 2981, 2918, +2855, 2792, 2729, 2666, 2603, 2540, 2477, 2414, 2351, 2288, 2225, 2162, 2099, +2036, 1973, 1910, 1847, 1784, 1721, 1658, 1595, 1532, 1469, 1406, 1343, 4053, +3990, 3927, 3864, 3801, 3738, 3675, 3612, 3549, 3486, 3423, 3360, 3297, 3234, +3171, 3108, 3045, 2982, 2919, 2856, 2793, 2730, 2667, 2604, 2541, 2478, 2415, +2352, 2289, 2226, 2163, 2100, 2037, 1974, 1911, 1848, 1785, 1722, 1659, 1596, +1533, 1470, 1407, 4054, 3991, 3928, 3865, 3802, 3739, 3676, 
3613, 3550, 3487, +3424, 3361, 3298, 3235, 3172, 3109, 3046, 2983, 2920, 2857, 2794, 2731, 2668, +2605, 2542, 2479, 2416, 2353, 2290, 2227, 2164, 2101, 2038, 1975, 1912, 1849, +1786, 1723, 1660, 1597, 1534, 1471, 4055, 3992, 3929, 3866, 3803, 3740, 3677, +3614, 3551, 3488, 3425, 3362, 3299, 3236, 3173, 3110, 3047, 2984, 2921, 2858, +2795, 2732, 2669, 2606, 2543, 2480, 2417, 2354, 2291, 2228, 2165, 2102, 2039, +1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 4056, 3993, 3930, 3867, 3804, +3741, 3678, 3615, 3552, 3489, 3426, 3363, 3300, 3237, 3174, 3111, 3048, 2985, +2922, 2859, 2796, 2733, 2670, 2607, 2544, 2481, 2418, 2355, 2292, 2229, 2166, +2103, 2040, 1977, 1914, 1851, 1788, 1725, 1662, 1599, 4057, 3994, 3931, 3868, +3805, 3742, 3679, 3616, 3553, 3490, 3427, 3364, 3301, 3238, 3175, 3112, 3049, +2986, 2923, 2860, 2797, 2734, 2671, 2608, 2545, 2482, 2419, 2356, 2293, 2230, +2167, 2104, 2041, 1978, 1915, 1852, 1789, 1726, 1663, 4058, 3995, 3932, 3869, +3806, 3743, 3680, 3617, 3554, 3491, 3428, 3365, 3302, 3239, 3176, 3113, 3050, +2987, 2924, 2861, 2798, 2735, 2672, 2609, 2546, 2483, 2420, 2357, 2294, 2231, +2168, 2105, 2042, 1979, 1916, 1853, 1790, 1727, 4059, 3996, 3933, 3870, 3807, +3744, 3681, 3618, 3555, 3492, 3429, 3366, 3303, 3240, 3177, 3114, 3051, 2988, +2925, 2862, 2799, 2736, 2673, 2610, 2547, 2484, 2421, 2358, 2295, 2232, 2169, +2106, 2043, 1980, 1917, 1854, 1791, 4060, 3997, 3934, 3871, 3808, 3745, 3682, +3619, 3556, 3493, 3430, 3367, 3304, 3241, 3178, 3115, 3052, 2989, 2926, 2863, +2800, 2737, 2674, 2611, 2548, 2485, 2422, 2359, 2296, 2233, 2170, 2107, 2044, +1981, 1918, 1855, 4061, 3998, 3935, 3872, 3809, 3746, 3683, 3620, 3557, 3494, +3431, 3368, 3305, 3242, 3179, 3116, 3053, 2990, 2927, 2864, 2801, 2738, 2675, +2612, 2549, 2486, 2423, 2360, 2297, 2234, 2171, 2108, 2045, 1982, 1919, 4062, +3999, 3936, 3873, 3810, 3747, 3684, 3621, 3558, 3495, 3432, 3369, 3306, 3243, +3180, 3117, 3054, 2991, 2928, 2865, 2802, 2739, 2676, 2613, 2550, 2487, 2424, +2361, 
2298, 2235, 2172, 2109, 2046, 1983, 4063, 4000, 3937, 3874, 3811, 3748, +3685, 3622, 3559, 3496, 3433, 3370, 3307, 3244, 3181, 3118, 3055, 2992, 2929, +2866, 2803, 2740, 2677, 2614, 2551, 2488, 2425, 2362, 2299, 2236, 2173, 2110, +2047, 4064, 4001, 3938, 3875, 3812, 3749, 3686, 3623, 3560, 3497, 3434, 3371, +3308, 3245, 3182, 3119, 3056, 2993, 2930, 2867, 2804, 2741, 2678, 2615, 2552, +2489, 2426, 2363, 2300, 2237, 2174, 2111, 4065, 4002, 3939, 3876, 3813, 3750, +3687, 3624, 3561, 3498, 3435, 3372, 3309, 3246, 3183, 3120, 3057, 2994, 2931, +2868, 2805, 2742, 2679, 2616, 2553, 2490, 2427, 2364, 2301, 2238, 2175, 4066, +4003, 3940, 3877, 3814, 3751, 3688, 3625, 3562, 3499, 3436, 3373, 3310, 3247, +3184, 3121, 3058, 2995, 2932, 2869, 2806, 2743, 2680, 2617, 2554, 2491, 2428, +2365, 2302, 2239, 4067, 4004, 3941, 3878, 3815, 3752, 3689, 3626, 3563, 3500, +3437, 3374, 3311, 3248, 3185, 3122, 3059, 2996, 2933, 2870, 2807, 2744, 2681, +2618, 2555, 2492, 2429, 2366, 2303, 4068, 4005, 3942, 3879, 3816, 3753, 3690, +3627, 3564, 3501, 3438, 3375, 3312, 3249, 3186, 3123, 3060, 2997, 2934, 2871, +2808, 2745, 2682, 2619, 2556, 2493, 2430, 2367, 4069, 4006, 3943, 3880, 3817, +3754, 3691, 3628, 3565, 3502, 3439, 3376, 3313, 3250, 3187, 3124, 3061, 2998, +2935, 2872, 2809, 2746, 2683, 2620, 2557, 2494, 2431, 4070, 4007, 3944, 3881, +3818, 3755, 3692, 3629, 3566, 3503, 3440, 3377, 3314, 3251, 3188, 3125, 3062, +2999, 2936, 2873, 2810, 2747, 2684, 2621, 2558, 2495, 4071, 4008, 3945, 3882, +3819, 3756, 3693, 3630, 3567, 3504, 3441, 3378, 3315, 3252, 3189, 3126, 3063, +3000, 2937, 2874, 2811, 2748, 2685, 2622, 2559, 4072, 4009, 3946, 3883, 3820, +3757, 3694, 3631, 3568, 3505, 3442, 3379, 3316, 3253, 3190, 3127, 3064, 3001, +2938, 2875, 2812, 2749, 2686, 2623, 4073, 4010, 3947, 3884, 3821, 3758, 3695, +3632, 3569, 3506, 3443, 3380, 3317, 3254, 3191, 3128, 3065, 3002, 2939, 2876, +2813, 2750, 2687, 4074, 4011, 3948, 3885, 3822, 3759, 3696, 3633, 3570, 3507, +3444, 3381, 3318, 3255, 3192, 
3129, 3066, 3003, 2940, 2877, 2814, 2751, 4075, +4012, 3949, 3886, 3823, 3760, 3697, 3634, 3571, 3508, 3445, 3382, 3319, 3256, +3193, 3130, 3067, 3004, 2941, 2878, 2815, 4076, 4013, 3950, 3887, 3824, 3761, +3698, 3635, 3572, 3509, 3446, 3383, 3320, 3257, 3194, 3131, 3068, 3005, 2942, +2879, 4077, 4014, 3951, 3888, 3825, 3762, 3699, 3636, 3573, 3510, 3447, 3384, +3321, 3258, 3195, 3132, 3069, 3006, 2943, 4078, 4015, 3952, 3889, 3826, 3763, +3700, 3637, 3574, 3511, 3448, 3385, 3322, 3259, 3196, 3133, 3070, 3007, 4079, +4016, 3953, 3890, 3827, 3764, 3701, 3638, 3575, 3512, 3449, 3386, 3323, 3260, +3197, 3134, 3071, 4080, 4017, 3954, 3891, 3828, 3765, 3702, 3639, 3576, 3513, +3450, 3387, 3324, 3261, 3198, 3135, 4081, 4018, 3955, 3892, 3829, 3766, 3703, +3640, 3577, 3514, 3451, 3388, 3325, 3262, 3199, 4082, 4019, 3956, 3893, 3830, +3767, 3704, 3641, 3578, 3515, 3452, 3389, 3326, 3263, 4083, 4020, 3957, 3894, +3831, 3768, 3705, 3642, 3579, 3516, 3453, 3390, 3327, 4084, 4021, 3958, 3895, +3832, 3769, 3706, 3643, 3580, 3517, 3454, 3391, 4085, 4022, 3959, 3896, 3833, +3770, 3707, 3644, 3581, 3518, 3455, 4086, 4023, 3960, 3897, 3834, 3771, 3708, +3645, 3582, 3519, 4087, 4024, 3961, 3898, 3835, 3772, 3709, 3646, 3583, 4088, +4025, 3962, 3899, 3836, 3773, 3710, 3647, 4089, 4026, 3963, 3900, 3837, 3774, +3711, 4090, 4027, 3964, 3901, 3838, 3775, 4091, 4028, 3965, 3902, 3839, 4092, +4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, + 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, + 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, + 5, 8, 
7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, + 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, + 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, + 30, 29, 31, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, + 9, 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, + 24, 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, + 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, + 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, + 61, 63, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, + 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, + 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, 35, + 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, 50, + 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, 61, + 63, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, + 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, + 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, + 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, + 22, 19, 29, 26, 23, 30, 27, 31, 0, 4, 1, 8, 5, + 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, + 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, + 31, 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, + 39, 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, + 51, 61, 58, 55, 62, 59, 63, 0, 4, 1, 8, 5, 2, + 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, + 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, + 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, + 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, + 61, 58, 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, + 73, 70, 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, + 85, 82, 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, + 100, 97, 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, + 107, 111, 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, + 122, 
119, 126, 123, 127, 0, 4, 1, 8, 5, 2, 12, 9, + 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, + 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, + 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, 46, 43, + 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, 61, 58, + 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, 73, 70, + 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, 85, 82, + 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, 100, 97, + 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, 107, 111, + 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, 122, 119, + 126, 123, 127, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, + 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, + 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, + 19, 27, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, + 22, 15, 30, 23, 31, 0, 8, 1, 16, 9, 2, 24, 17, + 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, + 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, + 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, + 31, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 0, 8, 1, 16, 9, 2, 24, 17, 10, + 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, + 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, + 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, + 64, 72, 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, + 90, 83, 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, + 61, 54, 47, 
62, 55, 63, 96, 104, 97, 112, 105, 98, 120, + 113, 106, 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, + 77, 70, 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 100, + 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, 126, + 119, 127, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, + 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, + 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, + 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 64, 72, + 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, 90, 83, + 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 96, 104, 97, 112, 105, 98, 120, 113, 106, + 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, 77, 70, + 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 128, 136, 129, + 144, 137, 130, 152, 145, 138, 131, 153, 146, 139, 154, 147, 155, + 100, 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, + 126, 119, 127, 160, 168, 161, 176, 169, 162, 184, 177, 170, 163, + 185, 178, 171, 186, 179, 187, 132, 140, 133, 148, 141, 134, 156, + 149, 142, 135, 157, 150, 143, 158, 151, 159, 192, 200, 193, 208, + 201, 194, 216, 209, 202, 195, 217, 210, 203, 218, 211, 219, 164, + 172, 165, 180, 173, 166, 188, 181, 174, 167, 189, 182, 175, 190, + 183, 191, 224, 232, 225, 240, 233, 226, 248, 241, 234, 227, 249, + 242, 235, 250, 243, 251, 196, 204, 197, 212, 205, 198, 220, 213, + 206, 199, 221, 214, 207, 222, 215, 223, 228, 236, 229, 244, 237, + 230, 252, 245, 238, 231, 253, 246, 239, 254, 247, 255, 0, 8, + 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, + 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, + 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, + 7, 29, 22, 15, 30, 23, 31, 64, 72, 65, 80, 73, 66, + 88, 81, 74, 67, 89, 82, 75, 90, 83, 91, 36, 44, 37, + 52, 45, 38, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, + 96, 104, 97, 112, 105, 98, 120, 113, 106, 99, 121, 114, 107, + 122, 115, 123, 68, 76, 69, 84, 77, 70, 92, 85, 78, 71, + 93, 86, 79, 94, 87, 95, 128, 136, 129, 144, 137, 130, 152, + 145, 138, 131, 153, 146, 139, 154, 147, 155, 
100, 108, 101, 116, + 109, 102, 124, 117, 110, 103, 125, 118, 111, 126, 119, 127, 160, + 168, 161, 176, 169, 162, 184, 177, 170, 163, 185, 178, 171, 186, + 179, 187, 132, 140, 133, 148, 141, 134, 156, 149, 142, 135, 157, + 150, 143, 158, 151, 159, 192, 200, 193, 208, 201, 194, 216, 209, + 202, 195, 217, 210, 203, 218, 211, 219, 164, 172, 165, 180, 173, + 166, 188, 181, 174, 167, 189, 182, 175, 190, 183, 191, 224, 232, + 225, 240, 233, 226, 248, 241, 234, 227, 249, 242, 235, 250, 243, + 251, 196, 204, 197, 212, 205, 198, 220, 213, 206, 199, 221, 214, + 207, 222, 215, 223, 228, 236, 229, 244, 237, 230, 252, 245, 238, + 231, 253, 246, 239, 254, 247, 255, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 0, 1, 2, 3, 4, 5, 
6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, + 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, + 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, + 35, 51, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 8, 24, 9, 40, 25, 10, 56, 41, + 26, 11, 57, 42, 27, 58, 43, 59, 12, 28, 13, 44, 29, + 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 0, 16, + 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, + 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, + 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, + 7, 53, 38, 23, 54, 39, 55, 68, 84, 69, 100, 85, 70, + 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, + 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, + 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, 121, 106, 91, + 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, 45, 30, 15, + 61, 46, 31, 62, 47, 63, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 0, 16, 1, 32, + 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 64, + 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, + 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, 161, + 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, 85, + 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, + 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, + 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, 226, + 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, 150, + 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, 74, + 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, 13, + 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, + 196, 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, + 246, 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, + 185, 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 200, 216, 201, 232, + 217, 202, 248, 233, 218, 203, 249, 234, 219, 250, 
235, 251, 140, + 156, 141, 172, 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, + 175, 191, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, 253, + 238, 223, 254, 239, 255, 0, 16, 1, 32, 17, 2, 48, 33, + 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, + 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, + 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, + 55, 128, 144, 129, 160, 145, 130, 176, 161, 146, 131, 177, 162, + 147, 178, 163, 179, 68, 84, 69, 100, 85, 70, 116, 101, 86, + 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, 40, 25, 10, + 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, 192, 208, 193, + 224, 209, 194, 240, 225, 210, 195, 241, 226, 211, 242, 227, 243, + 132, 148, 133, 164, 149, 134, 180, 165, 150, 135, 181, 166, 151, + 182, 167, 183, 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, + 121, 106, 91, 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, + 45, 30, 15, 61, 46, 31, 62, 47, 63, 256, 272, 257, 288, + 273, 258, 304, 289, 274, 259, 305, 290, 275, 306, 291, 307, 196, + 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, 246, + 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, 185, + 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, 109, + 94, 79, 125, 110, 95, 126, 111, 127, 320, 336, 321, 352, 337, + 322, 368, 353, 338, 323, 369, 354, 339, 370, 355, 371, 260, 276, + 261, 292, 277, 262, 308, 293, 278, 263, 309, 294, 279, 310, 295, + 311, 200, 216, 201, 232, 217, 202, 248, 233, 218, 203, 249, 234, + 219, 250, 235, 251, 140, 156, 141, 172, 157, 142, 188, 173, 158, + 143, 189, 174, 159, 190, 175, 191, 384, 400, 385, 416, 401, 386, + 432, 417, 402, 387, 433, 418, 403, 434, 419, 435, 324, 340, 325, + 356, 341, 326, 372, 357, 342, 327, 373, 358, 343, 374, 359, 375, + 264, 280, 265, 296, 281, 266, 312, 297, 282, 267, 313, 298, 283, + 314, 299, 315, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, + 253, 238, 223, 254, 239, 255, 448, 464, 449, 480, 465, 450, 496, + 481, 466, 451, 497, 482, 467, 498, 483, 499, 388, 404, 389, 420, + 405, 390, 436, 
421, 406, 391, 437, 422, 407, 438, 423, 439, 328, + 344, 329, 360, 345, 330, 376, 361, 346, 331, 377, 362, 347, 378, + 363, 379, 268, 284, 269, 300, 285, 270, 316, 301, 286, 271, 317, + 302, 287, 318, 303, 319, 452, 468, 453, 484, 469, 454, 500, 485, + 470, 455, 501, 486, 471, 502, 487, 503, 392, 408, 393, 424, 409, + 394, 440, 425, 410, 395, 441, 426, 411, 442, 427, 443, 332, 348, + 333, 364, 349, 334, 380, 365, 350, 335, 381, 366, 351, 382, 367, + 383, 456, 472, 457, 488, 473, 458, 504, 489, 474, 459, 505, 490, + 475, 506, 491, 507, 396, 412, 397, 428, 413, 398, 444, 429, 414, + 399, 445, 430, 415, 446, 431, 447, 460, 476, 461, 492, 477, 462, + 508, 493, 478, 463, 509, 494, 479, 510, 495, 511, 0, 16, 1, + 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, + 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, + 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, + 53, 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, + 161, 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, + 85, 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, + 24, 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, + 43, 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, + 226, 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, + 150, 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, + 74, 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, + 13, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, + 63, 256, 272, 257, 288, 273, 258, 304, 289, 274, 259, 305, 290, + 275, 306, 291, 307, 196, 212, 197, 228, 213, 198, 244, 229, 214, + 199, 245, 230, 215, 246, 231, 247, 136, 152, 137, 168, 153, 138, + 184, 169, 154, 139, 185, 170, 155, 186, 171, 187, 76, 92, 77, + 108, 93, 78, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 320, 336, 321, 352, 337, 322, 368, 353, 338, 323, 369, 354, 339, + 370, 355, 371, 260, 276, 261, 292, 277, 262, 308, 293, 278, 263, + 309, 294, 279, 310, 295, 311, 200, 216, 201, 232, 217, 202, 248, + 233, 218, 203, 249, 234, 219, 250, 235, 251, 
140, 156, 141, 172, + 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, 175, 191, 384, + 400, 385, 416, 401, 386, 432, 417, 402, 387, 433, 418, 403, 434, + 419, 435, 324, 340, 325, 356, 341, 326, 372, 357, 342, 327, 373, + 358, 343, 374, 359, 375, 264, 280, 265, 296, 281, 266, 312, 297, + 282, 267, 313, 298, 283, 314, 299, 315, 204, 220, 205, 236, 221, + 206, 252, 237, 222, 207, 253, 238, 223, 254, 239, 255, 448, 464, + 449, 480, 465, 450, 496, 481, 466, 451, 497, 482, 467, 498, 483, + 499, 388, 404, 389, 420, 405, 390, 436, 421, 406, 391, 437, 422, + 407, 438, 423, 439, 328, 344, 329, 360, 345, 330, 376, 361, 346, + 331, 377, 362, 347, 378, 363, 379, 268, 284, 269, 300, 285, 270, + 316, 301, 286, 271, 317, 302, 287, 318, 303, 319, 452, 468, 453, + 484, 469, 454, 500, 485, 470, 455, 501, 486, 471, 502, 487, 503, + 392, 408, 393, 424, 409, 394, 440, 425, 410, 395, 441, 426, 411, + 442, 427, 443, 332, 348, 333, 364, 349, 334, 380, 365, 350, 335, + 381, 366, 351, 382, 367, 383, 456, 472, 457, 488, 473, 458, 504, + 489, 474, 459, 505, 490, 475, 506, 491, 507, 396, 412, 397, 428, + 413, 398, 444, 429, 414, 399, 445, 430, 415, 446, 431, 447, 460, + 476, 461, 492, 477, 462, 508, 493, 478, 463, 509, 494, 479, 510, + 495, 511, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 
1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 
1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, + 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, + 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, + 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 0, + 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, + 67, 99, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, + 70, 39, 102, 71, 103, 8, 40, 9, 72, 41, 10, 104, 73, + 42, 11, 105, 74, 43, 106, 75, 107, 12, 44, 13, 76, 45, + 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 28, 60, 29, 92, 61, 30, + 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 0, 32, 1, + 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, + 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, + 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, + 101, 70, 39, 102, 71, 103, 132, 164, 133, 196, 165, 134, 228, + 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 136, + 168, 137, 200, 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, + 203, 235, 12, 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, + 78, 47, 110, 79, 111, 140, 172, 141, 204, 173, 142, 236, 205, + 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, + 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, 115, 144, 176, + 145, 208, 177, 146, 240, 209, 178, 147, 
241, 210, 179, 242, 211, + 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 148, 180, 149, 212, 181, 150, 244, 213, 182, + 151, 245, 214, 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, + 120, 89, 58, 27, 121, 90, 59, 122, 91, 123, 152, 184, 153, + 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, + 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, + 126, 95, 127, 156, 188, 157, 220, 189, 158, 252, 221, 190, 159, + 253, 222, 191, 254, 223, 255, 0, 32, 1, 64, 33, 2, 96, + 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, + 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, + 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, + 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, 259, 353, + 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, 228, 197, + 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, 41, + 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 384, 416, + 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, 482, 451, + 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, 357, 326, + 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, 201, 170, + 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, 45, 14, + 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 388, 420, 389, + 452, 421, 390, 484, 453, 422, 391, 485, 454, 423, 486, 455, 487, + 264, 296, 265, 328, 297, 266, 360, 329, 298, 267, 361, 330, 299, + 362, 331, 363, 140, 172, 141, 204, 173, 142, 236, 205, 174, 143, + 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, 18, 112, + 81, 50, 19, 113, 82, 51, 114, 83, 115, 392, 424, 393, 456, + 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, + 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, + 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, + 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 118, 87, 119, 396, 428, 397, 460, 429, + 398, 492, 461, 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, + 273, 336, 305, 
274, 368, 337, 306, 275, 369, 338, 307, 370, 339, + 371, 148, 180, 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, + 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 400, 432, 401, 464, 433, 402, + 496, 465, 434, 403, 497, 466, 435, 498, 467, 499, 276, 308, 277, + 340, 309, 278, 372, 341, 310, 279, 373, 342, 311, 374, 343, 375, + 152, 184, 153, 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, + 250, 219, 251, 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, + 125, 94, 63, 126, 95, 127, 404, 436, 405, 468, 437, 406, 500, + 469, 438, 407, 501, 470, 439, 502, 471, 503, 280, 312, 281, 344, + 313, 282, 376, 345, 314, 283, 377, 346, 315, 378, 347, 379, 156, + 188, 157, 220, 189, 158, 252, 221, 190, 159, 253, 222, 191, 254, + 223, 255, 408, 440, 409, 472, 441, 410, 504, 473, 442, 411, 505, + 474, 443, 506, 475, 507, 284, 316, 285, 348, 317, 286, 380, 349, + 318, 287, 381, 350, 319, 382, 351, 383, 412, 444, 413, 476, 445, + 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, 0, 32, + 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, + 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, + 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, + 7, 101, 70, 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, + 352, 321, 290, 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, + 196, 165, 134, 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, + 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, + 106, 75, 107, 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, + 481, 450, 419, 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, + 325, 294, 263, 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, + 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, + 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, + 79, 111, 512, 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, + 578, 547, 610, 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, + 422, 391, 485, 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, + 266, 360, 329, 298, 
267, 361, 330, 299, 362, 331, 363, 140, 172, + 141, 204, 173, 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, + 239, 16, 48, 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, + 51, 114, 83, 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, + 643, 737, 706, 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, + 612, 581, 550, 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, + 456, 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, + 268, 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, + 366, 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, + 241, 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, + 85, 54, 23, 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, + 801, 770, 864, 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, + 676, 645, 708, 677, 646, 740, 709, 678, 647, 741, 710, 679, 742, + 711, 743, 520, 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, + 586, 555, 618, 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, + 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, + 274, 368, 337, 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, + 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, + 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, + 59, 122, 91, 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, + 899, 993, 962, 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, + 868, 837, 806, 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, + 712, 681, 650, 744, 713, 682, 651, 745, 714, 683, 746, 715, 747, + 524, 556, 525, 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, + 622, 591, 623, 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, + 497, 466, 435, 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, + 341, 310, 279, 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, + 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, + 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, + 95, 127, 900, 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, + 966, 935, 998, 967, 999, 776, 808, 777, 840, 809, 778, 
872, 841, + 810, 779, 873, 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, + 654, 748, 717, 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, + 529, 592, 561, 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, + 627, 404, 436, 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, + 439, 502, 471, 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, + 283, 377, 346, 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, + 968, 937, 906, 1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, + 780, 812, 781, 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, + 878, 847, 879, 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, + 753, 722, 691, 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, + 597, 566, 535, 629, 598, 567, 630, 599, 631, 408, 440, 409, 472, + 441, 410, 504, 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, + 316, 285, 348, 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, + 351, 383, 908, 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, + 974, 943, 1006, 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, + 818, 787, 881, 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, + 662, 756, 725, 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, + 537, 600, 569, 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, + 635, 412, 444, 413, 476, 445, 414, 508, 477, 446, 415, 509, 478, + 447, 510, 479, 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, + 915, 1009, 978, 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, + 884, 853, 822, 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, + 728, 697, 666, 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, + 540, 572, 541, 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, + 638, 607, 639, 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, +1013, 982, 951, 1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, + 857, 826, 795, 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, + 701, 670, 764, 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, + 952, 921, 984, 953, 922, 
1016, 985, 954, 923, 1017, 986, 955, 1018, + 987, 1019, 796, 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, + 862, 831, 894, 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, + 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, 32, 1, 64, 33, + 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, + 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, + 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, + 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, + 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, + 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, + 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, + 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, + 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, + 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, + 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, + 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 512, + 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, 578, 547, 610, + 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, 422, 391, 485, + 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, 266, 360, 329, + 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, 141, 204, 173, + 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, 643, 737, 706, + 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, 612, 581, 550, + 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, 456, 425, 394, + 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, 300, 269, + 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, 335, 367, + 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, 210, 179, + 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, + 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, 801, 770, 864, + 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, 676, 645, 708, + 677, 646, 740, 709, 678, 
647, 741, 710, 679, 742, 711, 743, 520, + 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, 586, 555, 618, + 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, 430, 399, 493, + 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, 274, 368, 337, + 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, 149, 212, 181, + 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, 247, 24, 56, + 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, 59, 122, 91, + 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, 899, 993, 962, + 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, 868, 837, 806, + 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, 712, 681, 650, + 744, 713, 682, 651, 745, 714, 683, 746, 715, 747, 524, 556, 525, + 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, 622, 591, 623, + 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, 497, 466, 435, + 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, 341, 310, 279, + 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, 185, 154, 248, + 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, 60, 29, 92, + 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 900, + 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, 966, 935, 998, + 967, 999, 776, 808, 777, 840, 809, 778, 872, 841, 810, 779, 873, + 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, 654, 748, 717, + 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, 529, 592, 561, + 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, 627, 404, 436, + 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, 439, 502, 471, + 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, 283, 377, 346, + 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, 252, 221, 190, + 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, 968, 937, 906, +1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, 780, 812, 781, + 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, 878, 847, 879, + 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, 753, 722, 691, + 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, 597, 566, 535, + 629, 598, 567, 630, 599, 631, 408, 
440, 409, 472, 441, 410, 504, + 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, 316, 285, 348, + 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, 351, 383, 908, + 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, 974, 943, 1006, + 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, 818, 787, 881, + 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, 662, 756, 725, + 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, 537, 600, 569, + 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, 635, 412, 444, + 413, 476, 445, 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, + 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, 915, 1009, 978, + 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, 884, 853, 822, + 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, 728, 697, 666, + 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, 540, 572, 541, + 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, 638, 607, 639, + 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, 1013, 982, 951, +1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, 857, 826, 795, + 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, 701, 670, 764, + 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, 952, 921, 984, + 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, 987, 1019, 796, + 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, 862, 831, 894, + 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 
2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, + 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, + 76, 13, 
77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, + 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, + 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, + 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, + 131, 195, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, + 134, 71, 198, 135, 199, 8, 72, 9, 136, 73, 10, 200, 137, + 74, 11, 201, 138, 75, 202, 139, 203, 12, 76, 13, 140, 77, + 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, 207, 16, 80, + 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, 210, 147, + 211, 20, 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, + 87, 214, 151, 215, 24, 88, 25, 152, 89, 26, 216, 153, 90, + 27, 217, 154, 91, 218, 155, 219, 28, 92, 29, 156, 93, 30, + 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 64, 1, 128, 65, + 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 256, 320, + 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, + 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, + 71, 198, 135, 199, 260, 324, 261, 388, 325, 262, 452, 389, 326, + 
263, 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, + 200, 137, 74, 11, 201, 138, 75, 202, 139, 203, 264, 328, 265, + 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, + 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, + 206, 143, 207, 268, 332, 269, 396, 333, 270, 460, 397, 334, 271, + 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, 208, + 145, 82, 19, 209, 146, 83, 210, 147, 211, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 276, 340, 277, 404, 341, 278, 468, 405, 342, 279, 469, + 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, 216, 153, + 90, 27, 217, 154, 91, 218, 155, 219, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, + 351, 478, 415, 479, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 
511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, + 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, + 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, + 577, 514, 704, 641, 578, 515, 705, 642, 579, 706, 643, 707, 260, + 324, 261, 388, 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, + 391, 455, 8, 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, + 138, 75, 202, 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, + 834, 771, 961, 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, + 518, 708, 645, 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, + 265, 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, + 459, 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, + 79, 206, 143, 207, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 780, 844, 781, 908, 845, 782, 972, 909, 846, 783, 973, + 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, 720, 657, + 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, 404, 341, + 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, 24, 88, + 25, 152, 89, 26, 216, 
153, 90, 27, 217, 154, 91, 218, 155, + 219, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, 914, + 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, 598, + 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, 282, + 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, 29, + 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 792, 856, 793, 920, + 857, 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, + 604, 541, 668, 605, 542, 732, 669, 606, 543, 733, 670, 607, 734, + 671, 735, 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, + 926, 863, 990, 927, 991, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 
1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 64, 1, + 128, 65, 2, 
192, 129, 66, 3, 193, 130, 67, 194, 131, 195, + 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, + 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, + 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, 577, 514, 704, + 641, 578, 515, 705, 642, 579, 706, 643, 707, 260, 324, 261, 388, + 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, 391, 455, 8, + 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, 138, 75, 202, + 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, 834, 771, 961, + 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, 518, 708, 645, + 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, 265, 392, 329, + 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, 12, 76, + 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, + 207, 1024, 1088, 1025, 1152, 1089, 1026, 1216, 1153, 1090, 1027, 1217, 1154, +1091, 1218, 1155, 1219, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 1280, 1344, 1281, 1408, 1345, 1282, 1472, 1409, 1346, 1283, +1473, 1410, 1347, 1474, 1411, 1475, 1028, 1092, 1029, 1156, 1093, 1030, 1220, +1157, 1094, 1031, 1221, 1158, 1095, 1222, 1159, 1223, 776, 840, 777, 904, + 841, 778, 968, 905, 842, 779, 969, 906, 843, 970, 907, 971, 524, + 588, 525, 652, 589, 526, 716, 653, 590, 527, 717, 654, 591, 718, + 655, 719, 272, 336, 273, 400, 337, 274, 464, 401, 338, 275, 465, + 402, 339, 466, 403, 467, 20, 84, 21, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 214, 151, 215, 1536, 1600, 1537, 1664, 1601, +1538, 1728, 1665, 1602, 1539, 1729, 1666, 1603, 1730, 1667, 1731, 1284, 1348, +1285, 1412, 1349, 1286, 1476, 1413, 1350, 1287, 1477, 1414, 1351, 1478, 1415, +1479, 1032, 1096, 1033, 1160, 1097, 1034, 1224, 1161, 1098, 1035, 1225, 1162, +1099, 1226, 1163, 1227, 780, 844, 
781, 908, 845, 782, 972, 909, 846, + 783, 973, 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, + 720, 657, 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, + 404, 341, 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, + 24, 88, 25, 152, 89, 26, 216, 153, 90, 27, 217, 154, 91, + 218, 155, 219, 1792, 1856, 1793, 1920, 1857, 1794, 1984, 1921, 1858, 1795, +1985, 1922, 1859, 1986, 1923, 1987, 1540, 1604, 1541, 1668, 1605, 1542, 1732, +1669, 1606, 1543, 1733, 1670, 1607, 1734, 1671, 1735, 1288, 1352, 1289, 1416, +1353, 1290, 1480, 1417, 1354, 1291, 1481, 1418, 1355, 1482, 1419, 1483, 1036, +1100, 1037, 1164, 1101, 1038, 1228, 1165, 1102, 1039, 1229, 1166, 1103, 1230, +1167, 1231, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, + 914, 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, + 598, 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 1796, 1860, 1797, 1924, 1861, 1798, 1988, 1925, 1862, 1799, 1989, 1926, +1863, 1990, 1927, 1991, 1544, 1608, 1545, 1672, 1609, 1546, 1736, 1673, 1610, +1547, 1737, 1674, 1611, 1738, 1675, 1739, 1292, 1356, 1293, 1420, 1357, 1294, +1484, 1421, 1358, 1295, 1485, 1422, 1359, 1486, 1423, 1487, 1040, 1104, 1041, +1168, 1105, 1042, 1232, 1169, 1106, 1043, 1233, 1170, 1107, 1234, 1171, 1235, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 1800, 1864, 1801, 1928, +1865, 1802, 1992, 1929, 1866, 1803, 1993, 1930, 1867, 1994, 1931, 1995, 1548, +1612, 1549, 1676, 1613, 1550, 1740, 1677, 1614, 1551, 1741, 1678, 1615, 1742, +1679, 1743, 1296, 1360, 1297, 1424, 1361, 1298, 1488, 1425, 1362, 1299, 1489, +1426, 1363, 1490, 1427, 1491, 1044, 1108, 1045, 1172, 1109, 1046, 1236, 1173, 
+1110, 1047, 1237, 1174, 1111, 1238, 1175, 1239, 792, 856, 793, 920, 857, + 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, 604, + 541, 668, 605, 542, 732, 669, 606, 543, 733, 670, 607, 734, 671, + 735, 1804, 1868, 1805, 1932, 1869, 1806, 1996, 1933, 1870, 1807, 1997, 1934, +1871, 1998, 1935, 1999, 1552, 1616, 1553, 1680, 1617, 1554, 1744, 1681, 1618, +1555, 1745, 1682, 1619, 1746, 1683, 1747, 1300, 1364, 1301, 1428, 1365, 1302, +1492, 1429, 1366, 1303, 1493, 1430, 1367, 1494, 1431, 1495, 1048, 1112, 1049, +1176, 1113, 1050, 1240, 1177, 1114, 1051, 1241, 1178, 1115, 1242, 1179, 1243, + 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, 926, 863, + 990, 927, 991, 1808, 1872, 1809, 1936, 1873, 1810, 2000, 1937, 1874, 1811, +2001, 1938, 1875, 2002, 1939, 2003, 1556, 1620, 1557, 1684, 1621, 1558, 1748, +1685, 1622, 1559, 1749, 1686, 1623, 1750, 1687, 1751, 1304, 1368, 1305, 1432, +1369, 1306, 1496, 1433, 1370, 1307, 1497, 1434, 1371, 1498, 1435, 1499, 1052, +1116, 1053, 1180, 1117, 1054, 1244, 1181, 1118, 1055, 1245, 1182, 1119, 1246, +1183, 1247, 1812, 1876, 1813, 1940, 1877, 1814, 2004, 1941, 1878, 1815, 2005, +1942, 1879, 2006, 1943, 2007, 1560, 1624, 1561, 1688, 1625, 1562, 1752, 1689, +1626, 1563, 1753, 1690, 1627, 1754, 1691, 1755, 1308, 1372, 1309, 1436, 1373, +1310, 1500, 1437, 1374, 1311, 1501, 1438, 1375, 1502, 1439, 1503, 1816, 1880, +1817, 1944, 1881, 1818, 2008, 1945, 1882, 1819, 2009, 1946, 1883, 2010, 1947, +2011, 1564, 1628, 1565, 1692, 1629, 1566, 1756, 1693, 1630, 1567, 1757, 1694, +1631, 1758, 1695, 1759, 1820, 1884, 1821, 1948, 1885, 1822, 2012, 1949, 1886, +1823, 2013, 1950, 1887, 2014, 1951, 2015, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 
2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 0, 64, 1, 128, 65, 2, 192, 129, 66, + 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, + 448, 385, 
322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, + 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, + 512, 576, 513, 640, 577, 514, 704, 641, 578, 515, 705, 642, 579, + 706, 643, 707, 260, 324, 261, 388, 325, 262, 452, 389, 326, 263, + 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, 200, + 137, 74, 11, 201, 138, 75, 202, 139, 203, 768, 832, 769, 896, + 833, 770, 960, 897, 834, 771, 961, 898, 835, 962, 899, 963, 516, + 580, 517, 644, 581, 518, 708, 645, 582, 519, 709, 646, 583, 710, + 647, 711, 264, 328, 265, 392, 329, 266, 456, 393, 330, 267, 457, + 394, 331, 458, 395, 459, 12, 76, 13, 140, 77, 14, 204, 141, + 78, 15, 205, 142, 79, 206, 143, 207, 1024, 1088, 1025, 1152, 1089, +1026, 1216, 1153, 1090, 1027, 1217, 1154, 1091, 1218, 1155, 1219, 772, 836, + 773, 900, 837, 774, 964, 901, 838, 775, 965, 902, 839, 966, 903, + 967, 520, 584, 521, 648, 585, 522, 712, 649, 586, 523, 713, 650, + 587, 714, 651, 715, 268, 332, 269, 396, 333, 270, 460, 397, 334, + 271, 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, + 208, 145, 82, 19, 209, 146, 83, 210, 147, 211, 1280, 1344, 1281, +1408, 1345, 1282, 1472, 1409, 1346, 1283, 1473, 1410, 1347, 1474, 1411, 1475, +1028, 1092, 1029, 1156, 1093, 1030, 1220, 1157, 1094, 1031, 1221, 1158, 1095, +1222, 1159, 1223, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 1536, 1600, 1537, 1664, 1601, 1538, 1728, 1665, 1602, 1539, 1729, +1666, 1603, 1730, 1667, 1731, 1284, 1348, 1285, 1412, 1349, 1286, 1476, 1413, +1350, 1287, 1477, 1414, 1351, 1478, 1415, 1479, 1032, 1096, 1033, 1160, 1097, +1034, 1224, 1161, 1098, 1035, 1225, 1162, 1099, 1226, 1163, 1227, 780, 844, + 781, 908, 845, 782, 972, 909, 846, 783, 973, 910, 847, 974, 911, + 975, 528, 592, 529, 
656, 593, 530, 720, 657, 594, 531, 721, 658, + 595, 722, 659, 723, 276, 340, 277, 404, 341, 278, 468, 405, 342, + 279, 469, 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, + 216, 153, 90, 27, 217, 154, 91, 218, 155, 219, 1792, 1856, 1793, +1920, 1857, 1794, 1984, 1921, 1858, 1795, 1985, 1922, 1859, 1986, 1923, 1987, +1540, 1604, 1541, 1668, 1605, 1542, 1732, 1669, 1606, 1543, 1733, 1670, 1607, +1734, 1671, 1735, 1288, 1352, 1289, 1416, 1353, 1290, 1480, 1417, 1354, 1291, +1481, 1418, 1355, 1482, 1419, 1483, 1036, 1100, 1037, 1164, 1101, 1038, 1228, +1165, 1102, 1039, 1229, 1166, 1103, 1230, 1167, 1231, 784, 848, 785, 912, + 849, 786, 976, 913, 850, 787, 977, 914, 851, 978, 915, 979, 532, + 596, 533, 660, 597, 534, 724, 661, 598, 535, 725, 662, 599, 726, + 663, 727, 280, 344, 281, 408, 345, 282, 472, 409, 346, 283, 473, + 410, 347, 474, 411, 475, 28, 92, 29, 156, 93, 30, 220, 157, + 94, 31, 221, 158, 95, 222, 159, 223, 1796, 1860, 1797, 1924, 1861, +1798, 1988, 1925, 1862, 1799, 1989, 1926, 1863, 1990, 1927, 1991, 1544, 1608, +1545, 1672, 1609, 1546, 1736, 1673, 1610, 1547, 1737, 1674, 1611, 1738, 1675, +1739, 1292, 1356, 1293, 1420, 1357, 1294, 1484, 1421, 1358, 1295, 1485, 1422, +1359, 1486, 1423, 1487, 1040, 1104, 1041, 1168, 1105, 1042, 1232, 1169, 1106, +1043, 1233, 1170, 1107, 1234, 1171, 1235, 788, 852, 789, 916, 853, 790, + 980, 917, 854, 791, 981, 918, 855, 982, 919, 983, 536, 600, 537, + 664, 601, 538, 728, 665, 602, 539, 729, 666, 603, 730, 667, 731, + 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, 351, + 478, 415, 479, 1800, 1864, 1801, 1928, 1865, 1802, 1992, 1929, 1866, 1803, +1993, 1930, 1867, 1994, 1931, 1995, 1548, 1612, 1549, 1676, 1613, 1550, 1740, +1677, 1614, 1551, 1741, 1678, 1615, 1742, 1679, 1743, 1296, 1360, 1297, 1424, +1361, 1298, 1488, 1425, 1362, 1299, 1489, 1426, 1363, 1490, 1427, 1491, 1044, +1108, 1045, 1172, 1109, 1046, 1236, 1173, 1110, 1047, 1237, 1174, 1111, 1238, +1175, 1239, 792, 856, 793, 920, 857, 794, 984, 921, 
858, 795, 985, + 922, 859, 986, 923, 987, 540, 604, 541, 668, 605, 542, 732, 669, + 606, 543, 733, 670, 607, 734, 671, 735, 1804, 1868, 1805, 1932, 1869, +1806, 1996, 1933, 1870, 1807, 1997, 1934, 1871, 1998, 1935, 1999, 1552, 1616, +1553, 1680, 1617, 1554, 1744, 1681, 1618, 1555, 1745, 1682, 1619, 1746, 1683, +1747, 1300, 1364, 1301, 1428, 1365, 1302, 1492, 1429, 1366, 1303, 1493, 1430, +1367, 1494, 1431, 1495, 1048, 1112, 1049, 1176, 1113, 1050, 1240, 1177, 1114, +1051, 1241, 1178, 1115, 1242, 1179, 1243, 796, 860, 797, 924, 861, 798, + 988, 925, 862, 799, 989, 926, 863, 990, 927, 991, 1808, 1872, 1809, +1936, 1873, 1810, 2000, 1937, 1874, 1811, 2001, 1938, 1875, 2002, 1939, 2003, +1556, 1620, 1557, 1684, 1621, 1558, 1748, 1685, 1622, 1559, 1749, 1686, 1623, +1750, 1687, 1751, 1304, 1368, 1305, 1432, 1369, 1306, 1496, 1433, 1370, 1307, +1497, 1434, 1371, 1498, 1435, 1499, 1052, 1116, 1053, 1180, 1117, 1054, 1244, +1181, 1118, 1055, 1245, 1182, 1119, 1246, 1183, 1247, 1812, 1876, 1813, 1940, +1877, 1814, 2004, 1941, 1878, 1815, 2005, 1942, 1879, 2006, 1943, 2007, 1560, +1624, 1561, 1688, 1625, 1562, 1752, 1689, 1626, 1563, 1753, 1690, 1627, 1754, +1691, 1755, 1308, 1372, 1309, 1436, 1373, 1310, 1500, 1437, 1374, 1311, 1501, +1438, 1375, 1502, 1439, 1503, 1816, 1880, 1817, 1944, 1881, 1818, 2008, 1945, +1882, 1819, 2009, 1946, 1883, 2010, 1947, 2011, 1564, 1628, 1565, 1692, 1629, +1566, 1756, 1693, 1630, 1567, 1757, 1694, 1631, 1758, 1695, 1759, 1820, 1884, +1821, 1948, 1885, 1822, 2012, 1949, 1886, 1823, 2013, 1950, 1887, 2014, 1951, +2015, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 
4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 
4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 
4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 
4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 
4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 
4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, +}; + +#define SCAN_GROUP_TYPES 2 +#define MAX_LOG2_INDEX 7 + +const uint32_t* uvg_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = +{ + { + { g_scan_order_buffer + 0, g_scan_order_buffer + 1, g_scan_order_buffer + 3, g_scan_order_buffer + 7, g_scan_order_buffer + 15, g_scan_order_buffer + 31, g_scan_order_buffer + 63, }, + { g_scan_order_buffer + 127, g_scan_order_buffer + 129, g_scan_order_buffer + 133, g_scan_order_buffer + 141, g_scan_order_buffer + 157, g_scan_order_buffer + 189, g_scan_order_buffer + 253, }, + { g_scan_order_buffer + 381, g_scan_order_buffer + 385, g_scan_order_buffer + 393, g_scan_order_buffer + 409, g_scan_order_buffer + 441, g_scan_order_buffer + 505, g_scan_order_buffer + 633, }, + { g_scan_order_buffer + 889, g_scan_order_buffer + 897, g_scan_order_buffer + 913, g_scan_order_buffer + 945, g_scan_order_buffer + 1009, g_scan_order_buffer + 1137, g_scan_order_buffer + 1393, }, + { g_scan_order_buffer + 1905, g_scan_order_buffer + 1921, g_scan_order_buffer + 1953, g_scan_order_buffer + 2017, g_scan_order_buffer + 2145, g_scan_order_buffer + 2401, g_scan_order_buffer + 2913, }, + { g_scan_order_buffer + 3937, g_scan_order_buffer + 3969, g_scan_order_buffer + 4033, g_scan_order_buffer + 4161, g_scan_order_buffer + 4417, g_scan_order_buffer + 4929, g_scan_order_buffer + 5953, }, + { g_scan_order_buffer + 8001, g_scan_order_buffer + 8065, g_scan_order_buffer + 8193, g_scan_order_buffer + 8449, g_scan_order_buffer + 8961, g_scan_order_buffer + 9985, g_scan_order_buffer + 12033, }, + }, + { + { g_scan_order_buffer + 16129, g_scan_order_buffer + 16130, g_scan_order_buffer + 16132, g_scan_order_buffer + 16136, g_scan_order_buffer + 16144, g_scan_order_buffer + 16160, g_scan_order_buffer + 16192, 
}, + { g_scan_order_buffer + 16256, g_scan_order_buffer + 16258, g_scan_order_buffer + 16262, g_scan_order_buffer + 16270, g_scan_order_buffer + 16286, g_scan_order_buffer + 16318, g_scan_order_buffer + 16382, }, + { g_scan_order_buffer + 16510, g_scan_order_buffer + 16514, g_scan_order_buffer + 16522, g_scan_order_buffer + 16538, g_scan_order_buffer + 16570, g_scan_order_buffer + 16634, g_scan_order_buffer + 16762, }, + { g_scan_order_buffer + 17018, g_scan_order_buffer + 17026, g_scan_order_buffer + 17042, g_scan_order_buffer + 17074, g_scan_order_buffer + 17138, g_scan_order_buffer + 17266, g_scan_order_buffer + 17522, }, + { g_scan_order_buffer + 18034, g_scan_order_buffer + 18050, g_scan_order_buffer + 18082, g_scan_order_buffer + 18146, g_scan_order_buffer + 18274, g_scan_order_buffer + 18530, g_scan_order_buffer + 19042, }, + { g_scan_order_buffer + 20066, g_scan_order_buffer + 20098, g_scan_order_buffer + 20162, g_scan_order_buffer + 20290, g_scan_order_buffer + 20546, g_scan_order_buffer + 21058, g_scan_order_buffer + 22082, }, + { g_scan_order_buffer + 24130, g_scan_order_buffer + 24194, g_scan_order_buffer + 24322, g_scan_order_buffer + 24578, g_scan_order_buffer + 25090, g_scan_order_buffer + 26114, g_scan_order_buffer + 28162, }, + } +}; From 6ff9ae074e1c1c64741003939f3948493721b1c5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 9 Aug 2022 14:02:30 +0300 Subject: [PATCH 013/254] [isp] Add scan order getter. Add bookmark comments to scan order buffer. 
--- src/encode_coding_tree.c | 5 ++++- src/tables.c | 43 ++++++++++++++++++++++------------------ src/tables.h | 8 ++++++++ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 7a3f401c..a1fd3394 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -271,7 +271,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state, uint32_t width, uint8_t type, int8_t scan_mode, - double* bits_out) { + double* bits_out) +{ //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; uint32_t i; @@ -286,6 +287,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + double bits = 0; // Init base contexts according to block type @@ -297,6 +299,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, unsigned scan_cg_last = (unsigned )-1; //unsigned scan_pos_last = (unsigned )-1; + // ISP_TODO: height for (i = 0; i < width * width; i++) { if (coeff[scan[i]]) { //scan_pos_last = i; diff --git a/src/tables.c b/src/tables.c index 8c42964d..39b4f509 100644 --- a/src/tables.c +++ b/src/tables.c @@ -84,8 +84,8 @@ const uint32_t* const uvg_g_sig_last_scan[3][5] = { }; // Holds scan order indices for all possible block sizes for diagonal scan order and coefficient group scan order -static const uint32_t* const g_scan_order_buffer[32258] = { - 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, +static const uint32_t const g_scan_order_buffer[32258] = { + 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, // 1xN 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, @@ -94,7 +94,7 @@ static const uint32_t* const 
g_scan_order_buffer[32258] = { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, // 2xN 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, @@ -114,7 +114,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 86, 85, 88, 87, 90, 89, 92, 91, 94, 93, 96, 95, 98, 97, 100, 99, 102, 101, 104, 103, 106, 105, 108, 107, 110, 109, 112, 111, 114, 113, 116, 115, 118, 117, 120, 119, 122, 121, 124, - 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, + 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, // 4xN 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, @@ -153,7 +153,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 210, 207, 220, 217, 214, 211, 224, 221, 218, 215, 228, 225, 222, 219, 232, 229, 226, 223, 236, 233, 230, 227, 240, 237, 234, 231, 244, 241, 238, 235, 248, 245, 242, 239, 252, 249, 246, 243, 253, - 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, + 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, // 8xN 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, @@ -231,7 +231,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 446, 439, 496, 489, 482, 475, 468, 461, 454, 447, 504, 497, 490, 483, 476, 469, 462, 455, 505, 498, 491, 484, 477, 470, 463, 506, 499, 492, 485, 478, 471, 507, 500, 493, 486, 479, 508, 501, 494, - 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, + 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, // 16xN 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, @@ -387,7 +387,7 @@ 
static const uint32_t* const g_scan_order_buffer[32258] = { 939, 924, 909, 894, 879, 1015, 1000, 985, 970, 955, 940, 925, 910, 895, 1016, 1001, 986, 971, 956, 941, 926, 911, 1017, 1002, 987, 972, 957, 942, 927, 1018, 1003, 988, 973, 958, 943, 1019, 1004, 989, 974, - 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, + 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, // 32xN 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, @@ -700,7 +700,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 2039, 2008, 1977, 1946, 1915, 1884, 1853, 1822, 1791, 2040, 2009, 1978, 1947, 1916, 1885, 1854, 1823, 2041, 2010, 1979, 1948, 1917, 1886, 1855, 2042, 2011, 1980, 1949, 1918, 1887, 2043, 2012, 1981, 1950, 1919, 2044, 2013, 1982, 1951, -2045, 2014, 1983, 2046, 2015, 2047, 0, 1, 2, 3, 4, 5, 6, +2045, 2014, 1983, 2046, 2015, 2047, 0, 1, 2, 3, 4, 5, 6, // 64xN 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, @@ -1325,7 +1325,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 3645, 3582, 3519, 4087, 4024, 3961, 3898, 3835, 3772, 3709, 3646, 3583, 4088, 4025, 3962, 3899, 3836, 3773, 3710, 3647, 4089, 4026, 3963, 3900, 3837, 3774, 3711, 4090, 4027, 3964, 3901, 3838, 3775, 4091, 4028, 3965, 3902, 3839, 4092, -4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, +4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, // 1xN, coef groups 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -1335,7 +1335,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 
63, 63, 0, 1, 0, 2, 1, 3, 0, + 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, // 2xN 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, @@ -1355,7 +1355,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, + 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, // 4xN 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, @@ -1394,7 +1394,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, // 8xN 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, 27, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, @@ -1472,7 +1472,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, - 511, 511, 511, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 511, 511, 511, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 16xN 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, @@ -1628,7 +1628,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 
1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, -1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, // 32xN 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, @@ -1941,7 +1941,7 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, -2047, 2047, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, +2047, 2047, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // 64xN 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, @@ -2569,10 +2569,9 @@ static const uint32_t* const g_scan_order_buffer[32258] = { 4095, 4095, 4095, 4095, 4095, }; -#define SCAN_GROUP_TYPES 2 -#define MAX_LOG2_INDEX 7 - -const uint32_t* uvg_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = +// Get scan order table based on scan group index (diagonal or coef group) +// and log2 block width and height index +static const uint32_t* g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = { { { g_scan_order_buffer + 0, g_scan_order_buffer + 1, g_scan_order_buffer + 3, g_scan_order_buffer + 7, g_scan_order_buffer + 15, g_scan_order_buffer + 31, g_scan_order_buffer + 63, }, @@ -2593,3 +2592,9 @@ const uint32_t* uvg_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] { g_scan_order_buffer + 24130, g_scan_order_buffer + 24194, g_scan_order_buffer + 24322, g_scan_order_buffer + 24578, g_scan_order_buffer + 25090, g_scan_order_buffer + 26114, g_scan_order_buffer + 28162, }, } }; + +uint32_t* uvg_get_scan_order(int scan_group, int log2_w, int log2_h) +{ + // ISP_TODO: returning coef group type 
does not work yet. It will break for non-square blocks + return g_scan_order[scan_group][log2_w][log2_h]; +} diff --git a/src/tables.h b/src/tables.h index 1ab81cfb..8c94a7cb 100644 --- a/src/tables.h +++ b/src/tables.h @@ -136,4 +136,12 @@ extern const uint32_t* const uvg_g_sig_last_scan[3][5]; extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1]; extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; +#define SCAN_GROUP_TYPES 2 +#define MAX_LOG2_INDEX 7 + +#define SCAN_GROUP_DIAG 0 +#define SCAN_GROUP_COEF 1 + +uint32_t* uvg_get_scan_order(int scan_group, int log2_w, int log2_h); + #endif //TABLES_H_ From 8131e970e5cfb17862260702821bb2bb4c4eb139 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Aug 2022 19:35:15 +0300 Subject: [PATCH 014/254] [isp] Modify existing scan table calls to use new getter. Add safety assert to getter. --- src/encode_coding_tree.c | 13 ++++--- src/rdo.c | 35 +++++++++++-------- src/search_intra.c | 16 ++++++--- .../generic/encode_coding_tree-generic.c | 12 ++++--- src/tables.c | 25 +++++++++++-- src/tables.h | 4 +-- 6 files changed, 72 insertions(+), 33 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a1fd3394..6816ab26 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -283,10 +283,14 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // CONSTANTS - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_height = log2_block_width; // ISP_TODO: height + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + 
uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; + //const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; + //const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); double bits = 0; @@ -295,6 +299,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac->cur_ctx = base_coeff_group_ctx; + // ISP_TODO: height int maxCtxBins = (width * width * 7) >> 2; unsigned scan_cg_last = (unsigned )-1; //unsigned scan_pos_last = (unsigned )-1; diff --git a/src/rdo.c b/src/rdo.c index aa78c697..a9202150 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1141,7 +1141,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const int max_log2_tr_dynamic_range = 15; uint32_t log2_tr_width = uvg_math_floor_log2(width); uint32_t log2_tr_height = uvg_math_floor_log2(height); - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0]; const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1]; @@ -1182,8 +1183,10 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + //const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; + //const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t* scan = 
uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1221,8 +1224,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ scan_pos = (sbId << log2_cg_size) + scan_pos_in_sb; int last_pos_coded = sbSizeM1; uint32_t blkpos = scan[scan_pos]; - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); // TODO: height //===== quantization ===== // set coeff @@ -1391,7 +1394,8 @@ void uvg_rdoq( int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; - const uint32_t log2_block_size = uvg_g_convert_to_bit[ width ] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; int32_t scalinglist_type= (block_type == CU_INTRA ? 
0 : 3) + type; int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); @@ -1415,11 +1419,13 @@ void uvg_rdoq( memset(dest_coeff, 0, sizeof(coeff_t) * width * height); - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; + // ISP_TODO: height + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; const uint32_t num_blk_side = width >> shift; @@ -1431,8 +1437,9 @@ void uvg_rdoq( int32_t temp_diag = -1; int32_t temp_sum = -1; - const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; - + //const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; + const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); + int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; @@ -1527,8 +1534,8 @@ void uvg_rdoq( if (last_scanpos >= 0) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); // ISP_TODO: height //===== coefficient level estimation ===== int32_t level; @@ -1715,8 +1722,8 @@ void uvg_rdoq( uint32_t blkpos = scan[scanpos]; if( dest_coeff[ blkpos ] ) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint32_t pos_y = 
blkpos >> log2_block_width; + uint32_t pos_x = blkpos - ( pos_y << log2_block_width ); // ISP_TODO: height double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; diff --git a/src/search_intra.c b/src/search_intra.c index 06b86cc7..e5b23d7d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -194,21 +194,27 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, const vector2d_t lcu_px) { const int width = LCU_WIDTH >> depth; + const int height = width; // ISP_TODO: height int8_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); int32_t i; // ToDo: large block support in VVC? uint32_t sig_coeffgroup_flag[32 * 32] = { 0 }; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] - + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_idx]; + const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; // ISP_TODO: height + //const uint32_t *scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_idx]; + const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_idx, log2_block_width, log2_block_height); + const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_idx, log2_block_width, log2_block_height); + const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; signed scan_cg_last = -1; signed scan_pos_last = -1; + // ISP_TODO: 
height for (int i = 0; i < width * width; i++) { if (coeff[scan[i]]) { scan_pos_last = i; diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 67685f2f..dcd88fef 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -76,11 +76,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS const int height = width; // TODO: height for non-square blocks. - const uint32_t log2_block_size = uvg_g_convert_to_bit[width]+2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = - uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t log2_block_width = uvg_g_convert_to_bit[width]+2; + const uint32_t log2_block_height = uvg_g_convert_to_bit[width] + 2; // ISP_TODO: height + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; + //const uint32_t *scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; + //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); // Init base contexts according to block type diff --git a/src/tables.c b/src/tables.c index 39b4f509..4d3d8485 100644 --- a/src/tables.c +++ b/src/tables.c @@ -2593,8 +2593,27 @@ static const uint32_t* g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_I } }; -uint32_t* uvg_get_scan_order(int scan_group, int log2_w, int log2_h) + +/** + * \brief Return array of scan order indices. 
+ * + * \param scan_group Scan group type, normal or coefficient. + * \param scan_type Scan type, diagonal, horizontal or vertical + * \param log2_w Log2 of block width. + * \param log2_h Log2 of block height. + * + * \return Returns pointer to scan order table based on given dimensions. + */ +uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) { - // ISP_TODO: returning coef group type does not work yet. It will break for non-square blocks - return g_scan_order[scan_group][log2_w][log2_h]; + // ISP_TODO: horizontal and vertical scan types + assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); + + if (scan_group == SCAN_GROUP_NORM) { + return g_scan_order[scan_group][log2_w][log2_h]; + } + else { + // ISP_TODO: returning coef group type does not work yet. It will break for non-square blocks + return g_scan_order[scan_group][log2_w >> 2][log2_h >> 2]; + } } diff --git a/src/tables.h b/src/tables.h index 8c94a7cb..29914374 100644 --- a/src/tables.h +++ b/src/tables.h @@ -139,9 +139,9 @@ extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; #define SCAN_GROUP_TYPES 2 #define MAX_LOG2_INDEX 7 -#define SCAN_GROUP_DIAG 0 +#define SCAN_GROUP_NORM 0 #define SCAN_GROUP_COEF 1 -uint32_t* uvg_get_scan_order(int scan_group, int log2_w, int log2_h); +uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); #endif //TABLES_H_ From c4d1f80f8f44a71b1a57dd3e7ee0243f6b74bccd Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Aug 2022 19:37:09 +0300 Subject: [PATCH 015/254] [isp] Fix error in scan order getter. Change define names to better reflect what they do. Add more accurate bookmark comments to scan order buffer table. 
--- src/tables.c | 160 +++++++++++++++++++++++++-------------------------- src/tables.h | 4 +- 2 files changed, 82 insertions(+), 82 deletions(-) diff --git a/src/tables.c b/src/tables.c index 4d3d8485..0f5bfe31 100644 --- a/src/tables.c +++ b/src/tables.c @@ -85,26 +85,26 @@ const uint32_t* const uvg_g_sig_last_scan[3][5] = { // Holds scan order indices for all possible block sizes for diagonal scan order and coefficient group scan order static const uint32_t const g_scan_order_buffer[32258] = { - 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, // 1xN - 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, // UNGROUPED 1xN, 1x2, 1x4, 1x8 + 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // 1x16 + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 1x32 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, // 1x64 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, // 2xN - 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, // 2xN, 2x2 + 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, // 2x4, 2x8 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, - 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, + 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, // 2x16 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, - 26, 25, 28, 27, 30, 29, 31, 0, 2, 1, 4, 3, 6, + 26, 25, 28, 27, 30, 29, 31, 0, 2, 1, 4, 3, 6, // 2x32 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, 31, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, - 57, 60, 59, 62, 61, 63, 0, 2, 1, 4, 3, 6, 5, + 57, 60, 59, 62, 61, 63, 0, 2, 1, 4, 3, 6, 5, // 2x64 8, 7, 
10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, 31, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, @@ -114,16 +114,16 @@ static const uint32_t const g_scan_order_buffer[32258] = { 86, 85, 88, 87, 90, 89, 92, 91, 94, 93, 96, 95, 98, 97, 100, 99, 102, 101, 104, 103, 106, 105, 108, 107, 110, 109, 112, 111, 114, 113, 116, 115, 118, 117, 120, 119, 122, 121, 124, - 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, // 4xN - 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, - 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, + 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, // 4xN, 4x2 + 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, // 4x4 + 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, // 4x8 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, - 18, 15, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 0, + 18, 15, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 0, // 4x16 4, 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, - 47, 60, 57, 54, 51, 61, 58, 55, 62, 59, 63, 0, 4, + 47, 60, 57, 54, 51, 61, 58, 55, 62, 59, 63, 0, 4, // 4x32 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, @@ -133,7 +133,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 78, 75, 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, 87, 100, 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, 112, 109, 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, - 121, 118, 115, 125, 122, 119, 126, 123, 127, 0, 4, 1, 8, + 121, 118, 115, 125, 122, 119, 126, 123, 127, 0, 4, 1, 8, // 4x64 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, 35, 48, @@ -154,15 +154,15 @@ static const uint32_t const g_scan_order_buffer[32258] = { 219, 232, 229, 226, 223, 236, 233, 230, 227, 240, 
237, 234, 231, 244, 241, 238, 235, 248, 245, 242, 239, 252, 249, 246, 243, 253, 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, // 8xN - 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, - 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, // 8x2 + 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, // 8x4 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, - 21, 14, 7, 29, 22, 15, 30, 23, 31, 0, 8, 1, 16, + 21, 14, 7, 29, 22, 15, 30, 23, 31, 0, 8, 1, 16, // 8x8 9, 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 57, 50, 43, 36, 29, 22, 15, 58, 51, 44, 37, 30, 23, 59, 52, 45, 38, 31, 60, 53, - 46, 39, 61, 54, 47, 62, 55, 63, 0, 8, 1, 16, 9, + 46, 39, 61, 54, 47, 62, 55, 63, 0, 8, 1, 16, 9, // 8x16 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, @@ -172,7 +172,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 55, 112, 105, 98, 91, 84, 77, 70, 63, 120, 113, 106, 99, 92, 85, 78, 71, 121, 114, 107, 100, 93, 86, 79, 122, 115, 108, 101, 94, 87, 123, 116, 109, 102, 95, 124, 117, 110, 103, - 125, 118, 111, 126, 119, 127, 0, 8, 1, 16, 9, 2, 24, + 125, 118, 111, 126, 119, 127, 0, 8, 1, 16, 9, 2, 24, // 8x32 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, 72, 65, @@ -192,7 +192,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 219, 212, 205, 198, 191, 248, 241, 234, 227, 220, 213, 206, 199, 249, 242, 235, 228, 221, 214, 207, 250, 243, 236, 229, 222, 215, 251, 244, 237, 230, 223, 252, 245, 238, 231, 253, 246, 239, 254, - 247, 255, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 32, + 247, 255, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 32, // 8x64 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, 72, 65, 58, 51, 
44, 37, @@ -232,15 +232,15 @@ static const uint32_t const g_scan_order_buffer[32258] = { 483, 476, 469, 462, 455, 505, 498, 491, 484, 477, 470, 463, 506, 499, 492, 485, 478, 471, 507, 500, 493, 486, 479, 508, 501, 494, 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, // 16xN - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, // 16x2 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, - 30, 15, 31, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, + 30, 15, 31, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, // 16x4 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, - 47, 63, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, + 47, 63, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, // 16x8 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, @@ -250,7 +250,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 44, 29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, - 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, // 16x16 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, 54, @@ -269,7 +269,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 141, 126, 111, 247, 232, 217, 202, 187, 172, 157, 142, 127, 248, 233, 218, 203, 188, 173, 158, 143, 249, 234, 219, 204, 189, 174, 159, 250, 235, 220, 205, 190, 175, 251, 236, 221, 206, 191, 252, - 237, 222, 207, 253, 238, 223, 254, 239, 255, 0, 16, 1, 32, + 237, 222, 207, 253, 238, 223, 254, 239, 255, 0, 16, 1, 32, // 16x32 17, 2, 48, 33, 18, 3, 64, 
49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, @@ -309,7 +309,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 428, 413, 398, 383, 504, 489, 474, 459, 444, 429, 414, 399, 505, 490, 475, 460, 445, 430, 415, 506, 491, 476, 461, 446, 431, 507, 492, 477, 462, 447, 508, 493, 478, 463, 509, 494, 479, 510, 495, - 511, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, + 511, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, // 16x64 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, @@ -390,12 +390,12 @@ static const uint32_t const g_scan_order_buffer[32258] = { 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, // 32xN 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, + 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, // 32x2 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, - 62, 31, 63, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, + 62, 31, 63, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, // 32x4 97, 66, 35, 4, 98, 67, 36, 5, 99, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 8, 102, 71, 40, 9, 103, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 12, 106, 75, 44, @@ -405,7 +405,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 54, 23, 117, 86, 55, 24, 118, 87, 56, 25, 119, 88, 57, 26, 120, 89, 58, 27, 121, 90, 59, 28, 122, 91, 60, 29, 123, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, - 127, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, + 127, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, // 32x8 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, 
195, 164, 133, 102, 71, 40, @@ -424,7 +424,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, - 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 0, 32, 1, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 0, 32, 1, // 32x16 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, @@ -464,7 +464,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, - 479, 511, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, + 479, 511, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, // 32x32 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, @@ -542,7 +542,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 890, 859, 828, 797, 766, 735, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, 924, 893, 862, 831, 1018, 987, 956, 925, 894, 863, 1019, 988, 957, - 926, 895, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, + 926, 895, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, // 32x64 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, @@ -705,7 +705,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 59, 60, 61, 62, 63, 
0, 64, 1, 65, 2, 66, 3, 67, + 59, 60, 61, 62, 63, 0, 64, 1, 65, 2, 66, 3, 67, // 64x2 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, @@ -715,7 +715,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, - 126, 63, 127, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, + 126, 63, 127, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, // 64x4 193, 130, 67, 4, 194, 131, 68, 5, 195, 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 8, 198, 135, 72, 9, 199, 136, 73, 10, 200, 137, 74, 11, 201, 138, 75, 12, 202, 139, 76, @@ -734,7 +734,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 52, 242, 179, 116, 53, 243, 180, 117, 54, 244, 181, 118, 55, 245, 182, 119, 56, 246, 183, 120, 57, 247, 184, 121, 58, 248, 185, 122, 59, 249, 186, 123, 60, 250, 187, 124, 61, 251, 188, - 125, 62, 252, 189, 126, 63, 253, 190, 127, 254, 191, 255, 0, + 125, 62, 252, 189, 126, 63, 253, 190, 127, 254, 191, 255, 0, // 64x8 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 449, 386, 323, 260, @@ -774,7 +774,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 440, 377, 314, 251, 188, 125, 62, 504, 441, 378, 315, 252, 189, 126, 63, 505, 442, 379, 316, 253, 190, 127, 506, 443, 380, 317, 254, 191, 507, 444, 381, 318, 255, 508, 445, 382, 319, 509, 446, - 383, 510, 447, 511, 0, 64, 1, 128, 65, 2, 192, 129, 66, + 383, 510, 447, 511, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x16 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, @@ -853,7 +853,7 @@ static const uint32_t const g_scan_order_buffer[32258] 
= { 700, 637, 574, 511, 1016, 953, 890, 827, 764, 701, 638, 575, 1017, 954, 891, 828, 765, 702, 639, 1018, 955, 892, 829, 766, 703, 1019, 956, 893, 830, 767, 1020, 957, 894, 831, 1021, 958, 895, 1022, 959, -1023, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, +1023, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, // 64x32 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, 387, 324, 261, @@ -1010,7 +1010,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 1534, 1471, 2039, 1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 2040, 1977, 1914, 1851, 1788, 1725, 1662, 1599, 2041, 1978, 1915, 1852, 1789, 1726, 1663, 2042, 1979, 1916, 1853, 1790, 1727, 2043, 1980, 1917, 1854, 1791, 2044, 1981, -1918, 1855, 2045, 1982, 1919, 2046, 1983, 2047, 0, 64, 1, 128, 65, +1918, 1855, 2045, 1982, 1919, 2046, 1983, 2047, 0, 64, 1, 128, 65, // 64x64 2, 192, 129, 66, 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, @@ -1325,27 +1325,27 @@ static const uint32_t const g_scan_order_buffer[32258] = { 3645, 3582, 3519, 4087, 4024, 3961, 3898, 3835, 3772, 3709, 3646, 3583, 4088, 4025, 3962, 3899, 3836, 3773, 3710, 3647, 4089, 4026, 3963, 3900, 3837, 3774, 3711, 4090, 4027, 3964, 3901, 3838, 3775, 4091, 4028, 3965, 3902, 3839, 4092, -4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, // 1xN, coef groups - 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, +4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, // 4x4 GROUPED 1xN, 1x2, 1x4 + 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, // 1x8, 1x16 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 1x32 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, + 25, 26, 27, 28, 
29, 30, 31, 0, 1, 2, 3, 4, 5, // 1x64 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, // 2xN - 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, - 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, + 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, // 2xN, 2x2, 2x4 + 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, // 2x8 + 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, // 2x16 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, - 30, 29, 31, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, + 30, 29, 31, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, // 2x32 9, 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, - 61, 63, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, + 61, 63, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, // 2x64 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, 50, @@ -1355,16 +1355,16 @@ static const uint32_t const g_scan_order_buffer[32258] = { 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, // 4xN + 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, // 4xN, 4x2, 4x4 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, - 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, + 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, // 4x8 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, - 22, 19, 29, 26, 23, 30, 27, 31, 0, 4, 1, 8, 5, + 22, 19, 29, 26, 23, 30, 27, 31, 0, 4, 1, 8, 5, // 4x16 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, 
33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, - 51, 61, 58, 55, 62, 59, 63, 0, 4, 1, 8, 5, 2, + 51, 61, 58, 55, 62, 59, 63, 0, 4, 1, 8, 5, 2, // 4x32 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, @@ -1374,7 +1374,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 85, 82, 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, 100, 97, 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, 107, 111, 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, - 122, 119, 126, 123, 127, 0, 4, 1, 8, 5, 2, 12, 9, + 122, 119, 126, 123, 127, 0, 4, 1, 8, 5, 2, 12, 9, // 4x64 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, 46, 43, @@ -1394,16 +1394,16 @@ static const uint32_t const g_scan_order_buffer[32258] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, // 8xN - 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, // 8xN, 8x2 + 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, // 8x4 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, 27, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, - 22, 15, 30, 23, 31, 0, 8, 1, 16, 9, 2, 24, 17, + 22, 15, 30, 23, 31, 0, 8, 1, 16, 9, 2, 24, 17, // 8x8 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, - 47, 62, 55, 63, 0, 8, 1, 16, 9, 2, 24, 17, 10, + 47, 62, 55, 63, 0, 8, 1, 16, 9, 2, 24, 17, 10, // 8x16 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, 
22, 15, 30, 23, 31, @@ -1413,7 +1413,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 113, 106, 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, 77, 70, 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 100, 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, 126, - 119, 127, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, + 119, 127, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, // 8x32 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 64, 72, @@ -1432,7 +1432,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 183, 191, 224, 232, 225, 240, 233, 226, 248, 241, 234, 227, 249, 242, 235, 250, 243, 251, 196, 204, 197, 212, 205, 198, 220, 213, 206, 199, 221, 214, 207, 222, 215, 223, 228, 236, 229, 244, 237, - 230, 252, 245, 238, 231, 253, 246, 239, 254, 247, 255, 0, 8, + 230, 252, 245, 238, 231, 253, 246, 239, 254, 247, 255, 0, 8, // 8x64 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, @@ -1473,14 +1473,14 @@ static const uint32_t const g_scan_order_buffer[32258] = { 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 16xN - 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, + 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, // 16x2 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, - 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, + 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, // 16x4 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, 55, 8, 24, 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, 12, 28, 13, 44, 29, - 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 0, 16, + 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 0, 16, // 16x8 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 
64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, @@ -1490,7 +1490,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 76, 92, 77, 108, 93, 78, 124, - 109, 94, 79, 125, 110, 95, 126, 111, 127, 0, 16, 1, 32, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 0, 16, 1, 32, // 16x16 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, @@ -1510,7 +1510,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 217, 202, 248, 233, 218, 203, 249, 234, 219, 250, 235, 251, 140, 156, 141, 172, 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, 175, 191, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, 253, - 238, 223, 254, 239, 255, 0, 16, 1, 32, 17, 2, 48, 33, + 238, 223, 254, 239, 255, 0, 16, 1, 32, 17, 2, 48, 33, // 16x32 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, @@ -1549,7 +1549,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 383, 456, 472, 457, 488, 473, 458, 504, 489, 474, 459, 505, 490, 475, 506, 491, 507, 396, 412, 397, 428, 413, 398, 444, 429, 414, 399, 445, 430, 415, 446, 431, 447, 460, 476, 461, 492, 477, 462, - 508, 493, 478, 463, 509, 494, 479, 510, 495, 511, 0, 16, 1, + 508, 493, 478, 463, 509, 494, 479, 510, 495, 511, 0, 16, 1, // 16x64 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, @@ -1631,11 +1631,11 @@ static const uint32_t const g_scan_order_buffer[32258] = { 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, // 32xN 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 0, 
32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, // 32x2 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, - 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 0, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 0, // 32x4 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, 71, 103, 8, 40, 9, 72, 41, 10, 104, 73, @@ -1645,7 +1645,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 115, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, 55, 118, 87, 119, 24, 56, 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, 59, 122, 91, 123, 28, 60, 29, 92, 61, 30, - 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 0, 32, 1, + 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 0, 32, 1, // 32x8 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, @@ -1665,7 +1665,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 156, 188, 157, 220, 189, 158, 252, 221, 190, 159, - 253, 222, 191, 254, 223, 255, 0, 32, 1, 64, 33, 2, 96, + 253, 222, 191, 254, 223, 255, 0, 32, 1, 64, 33, 2, 96, // 32x16 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, @@ -1704,7 +1704,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 223, 255, 408, 440, 409, 472, 441, 410, 504, 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, 316, 285, 348, 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, 351, 383, 412, 444, 413, 476, 445, - 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, 0, 32, + 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, 
511, 0, 32, // 32x32 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, @@ -1783,7 +1783,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 952, 921, 984, 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, 987, 1019, 796, 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, 862, 831, 894, 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, - 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, 32, 1, 64, 33, + 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, 32, 1, 64, 33, // 32x64 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, @@ -1946,7 +1946,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, + 63, 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, // 64x2 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, @@ -1955,7 +1955,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, // 64x4 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, 8, 72, 9, 136, 73, 10, 200, 137, @@ -1975,7 +1975,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 0, 64, 1, 128, 65, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 64, 1, 128, 65, // 64x8 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, @@ -2015,7 +2015,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, - 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, + 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, // 64x16 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, @@ -2093,7 +2093,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, -1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 64, 1, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 64, 1, // 64x32 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, @@ -2251,7 +2251,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, -2047, 2047, 2047, 2047, 0, 
64, 1, 128, 65, 2, 192, 129, 66, +2047, 2047, 2047, 2047, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x64 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, @@ -2569,7 +2569,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { 4095, 4095, 4095, 4095, 4095, }; -// Get scan order table based on scan group index (diagonal or coef group) +// Get scan order table based on scan group type (grouped or ungrouped) // and log2 block width and height index static const uint32_t* g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = { @@ -2609,11 +2609,11 @@ uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, in // ISP_TODO: horizontal and vertical scan types assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); - if (scan_group == SCAN_GROUP_NORM) { + if (scan_group == SCAN_GROUP_4X4) { return g_scan_order[scan_group][log2_w][log2_h]; } else { // ISP_TODO: returning coef group type does not work yet. It will break for non-square blocks - return g_scan_order[scan_group][log2_w >> 2][log2_h >> 2]; + return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; } } diff --git a/src/tables.h b/src/tables.h index 29914374..66b4a2c0 100644 --- a/src/tables.h +++ b/src/tables.h @@ -139,8 +139,8 @@ extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; #define SCAN_GROUP_TYPES 2 #define MAX_LOG2_INDEX 7 -#define SCAN_GROUP_NORM 0 -#define SCAN_GROUP_COEF 1 +#define SCAN_GROUP_UNGROUPED 0 +#define SCAN_GROUP_4X4 1 uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); From 6f756e831d4ad8fd3e1c386ee6030873ad898720 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Aug 2022 19:38:47 +0300 Subject: [PATCH 016/254] [isp] Uncomment old scan order code to test against new one. Add assert to ensure old and new tables are the same. 
--- src/encode_coding_tree.c | 11 +++++--- src/rdo.c | 25 +++++++++++++------ src/search_intra.c | 11 +++++--- .../generic/encode_coding_tree-generic.c | 11 +++++--- 4 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 6816ab26..2469904f 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -286,11 +286,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state, const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; const uint32_t log2_block_height = log2_block_width; // ISP_TODO: height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; - //const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - //const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; + const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); double bits = 0; @@ -307,6 +307,9 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // ISP_TODO: height for (i = 0; i < width * width; i++) { if (coeff[scan[i]]) { + // ISP_DEBUG + assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); + assert(old_scan_cg[i >> log2_cg_size] == scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); //scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> 
log2_cg_size]] = 1; } diff --git a/src/rdo.c b/src/rdo.c index a9202150..dfb2f06f 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1183,10 +1183,10 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - //const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - //const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); + const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; + const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1212,6 +1212,9 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ for (uint32_t sbId = 0; sbId < cg_num; sbId++) { uint32_t cg_blkpos = scan_cg[sbId]; + // ISP_DEBUG + assert(old_scan[sbId] == scan[sbId] && "Old scan_cg differs from the new one."); + assert(old_scan_cg[sbId] == scan_cg[sbId] && "Old scan_cg differs from the new one."); int no_coeff_coded = 0; base_cost = 0.0; @@ -1424,8 +1427,12 @@ void uvg_rdoq( const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); - //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; - const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); + const uint32_t *old_scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_width - 1 ]; + const uint32_t 
*old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; + + const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; const uint32_t num_blk_side = width >> shift; @@ -1437,8 +1444,7 @@ void uvg_rdoq( int32_t temp_diag = -1; int32_t temp_sum = -1; - //const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; - const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); + int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; @@ -1475,6 +1481,8 @@ void uvg_rdoq( for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; + assert(old_scan[scanpos] == scan[scanpos] && "Scan index differs from old system."); + if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; uint32_t blkpos = scan[scanpos]; int32_t q = quant_coeff[blkpos]; @@ -1509,6 +1517,7 @@ void uvg_rdoq( int32_t last_x_bits[32], last_y_bits[32]; for (int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { + assert(old_scan_cg[cg_scanpos] == scan_cg[cg_scanpos] && "Scan cg index differs from old system."); uint32_t cg_blkpos = scan_cg[cg_scanpos]; uint32_t cg_pos_y = cg_blkpos / num_blk_side; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); diff --git a/src/search_intra.c b/src/search_intra.c index e5b23d7d..5b348b0e 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -204,10 +204,10 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; // ISP_TODO: 
height - //const uint32_t *scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; - //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_idx]; - const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_idx, log2_block_width, log2_block_height); - const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_idx, log2_block_width, log2_block_height); + const uint32_t *old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_width - 1]; + const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_idx]; + const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); + const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; @@ -217,6 +217,9 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, // ISP_TODO: height for (int i = 0; i < width * width; i++) { if (coeff[scan[i]]) { + // ISP_DEBUG + assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); + assert(old_scan_cg[i >> log2_cg_size] == scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index dcd88fef..c1d2add9 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -79,10 +79,10 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, const uint32_t log2_block_width = uvg_g_convert_to_bit[width]+2; const uint32_t log2_block_height = uvg_g_convert_to_bit[width] + 2; // ISP_TODO: height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; - //const 
uint32_t *scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - //const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_NORM, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_COEF, scan_mode, log2_block_width, log2_block_height); + const uint32_t *old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; + const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); // Init base contexts according to block type @@ -94,6 +94,9 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, for (int i = 0; i < width * width; i++) { if (coeff[scan[i]]) { + // ISP_DEBUG + assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); + assert(old_scan_cg[i >> log2_cg_size] == scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } From f9116441da390fe1d6affc141f320d346955807a Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 11 Aug 2022 13:11:41 +0300 Subject: [PATCH 017/254] [isp] Fix avx2 function call. Missing height parameter. 
--- src/strategies/avx2/dct-avx2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index f3c812ed..fea2256d 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1622,6 +1622,7 @@ static void mts_idct_avx2( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) From 3a874ab5dd3b68b20ca48a570bb15a8a1145c27e Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 11 Aug 2022 13:22:36 +0300 Subject: [PATCH 018/254] [isp] Comment out dct non square function. It is not needed since mts dct function will handle transform for non square blocks. --- src/strategies/generic/dct-generic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 717f1b5f..5b4c5d7e 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -739,10 +739,11 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \ } -static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output) -{ - // ISP_TODO: non-square transform here -} + +//static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output) +//{ +// // ISP_TODO: non-square transform here +//} DCT_NXN_GENERIC(4); DCT_NXN_GENERIC(8); @@ -2606,7 +2607,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic); success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic); success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic); - success &= uvg_strategyselector_register(opaque, "dct_non_square", 
"generic", 0, &dct_non_square_generic); + //success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic); success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic); From 370bd07c557ccee5f35fb0c47845a29cabce0413 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 11 Aug 2022 13:55:04 +0300 Subject: [PATCH 019/254] [isp] Fix error in mts dct and idct. --- src/strategies/avx2/dct-avx2.c | 4 ++-- src/strategies/generic/dct-generic.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index fea2256d..ada4c9fe 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1601,7 +1601,7 @@ static void mts_dct_avx2( uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && width == height) { dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); @@ -1632,7 +1632,7 @@ static void mts_idct_avx2( uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2) + if (type_hor == DCT2 && type_ver == DCT2 && width == height) { dct_func* idct_func = uvg_get_idct_func(width, color, tu->type); idct_func(bitdepth, input, output); diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 5b4c5d7e..c9074132 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2504,7 +2504,7 @@ static void mts_dct_generic( // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width == height) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && 
width == height) { dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); @@ -2558,7 +2558,7 @@ static void mts_idct_generic( // ISP_TODO: height uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width == height) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); idct_func(bitdepth, input, output); From 8b7d573ae729f59a2abd108f3e30f946b831d54e Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 11 Aug 2022 14:30:04 +0300 Subject: [PATCH 020/254] [isp] Add height to idct getter function. Check block dimensions in transform 2d functions. --- src/strategies/avx2/dct-avx2.c | 2 +- src/strategies/generic/dct-generic.c | 2 +- src/strategies/strategies-dct.c | 9 +++++++-- src/strategies/strategies-dct.h | 2 +- src/transform.c | 5 ++--- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index ada4c9fe..61e08839 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1634,7 +1634,7 @@ static void mts_idct_avx2( if (type_hor == DCT2 && type_ver == DCT2 && width == height) { - dct_func* idct_func = uvg_get_idct_func(width, color, tu->type); + dct_func* idct_func = uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } else diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index c9074132..ff221180 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2560,7 +2560,7 @@ static void mts_idct_generic( if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { - dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); + dct_func *idct_func = 
uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } else diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index e7cc37e9..8441dfdd 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -99,7 +99,7 @@ dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_ if (width != height) { // Non-square block. Return generic dct for non-square blokcs. assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function."); - return uvg_dct_non_square; + //return uvg_dct_non_square; } switch (width) { case 4: @@ -128,8 +128,13 @@ dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_ * * \returns Pointer to the function. */ -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. Return generic dct for non-square blokcs. + assert(false && "This should never be called at this point. 
Non-square stuff is done inside mts_idct function."); + //return uvg_idct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index 59e05084..b883b3e5 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -86,7 +86,7 @@ extern mts_idct_func* uvg_mts_idct; int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth); dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type); -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type); +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type); diff --git a/src/transform.c b/src/transform.c index 542ab3de..4a1b4042 100644 --- a/src/transform.c +++ b/src/transform.c @@ -271,14 +271,13 @@ void uvg_itransform2d(const encoder_control_t * const encoder, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts) + if (encoder->cfg.mts || block_width != block_height) { uvg_mts_idct(encoder->bitdepth, color, tu, block_width, block_height, coeff, block, encoder->cfg.mts); } else { - // ISP_TODO: block height - dct_func *idct_func = uvg_get_idct_func(block_width, color, tu->type); + dct_func *idct_func = uvg_get_idct_func(block_width, block_height, color, tu->type); idct_func(encoder->bitdepth, coeff, block); } } From 09b905c6c4f9269656f12a4d926fc8714fe9babe Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 11 Aug 2022 14:41:21 +0300 Subject: [PATCH 021/254] [isp] Add height to get_tr_type function. 
--- src/strategies/avx2/dct-avx2.c | 5 +++-- src/strategies/generic/dct-generic.c | 8 +++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 61e08839..35890e91 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1579,6 +1579,7 @@ static tr_func* idct_table[5] = { extern void uvg_get_tr_type( int8_t width, + int8_t height, color_t color, const cu_info_t* tu, tr_type_t* hor_out, @@ -1599,7 +1600,7 @@ static void mts_dct_avx2( tr_type_t type_ver; // ISP_TODO: height passed but not used - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && width == height) { @@ -1630,7 +1631,7 @@ static void mts_idct_avx2( tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && width == height) { diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index ff221180..c790034f 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2442,6 +2442,7 @@ static const tr_type_t mts_subset_intra[4][2] = { { DST7, DST7 }, { DCT8, DST7 } void uvg_get_tr_type( int8_t width, + int8_t height, color_t color, const cu_info_t* tu, tr_type_t* hor_out, @@ -2456,7 +2457,6 @@ void uvg_get_tr_type( return; } - const int height = width; const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? 
mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER)); const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER); @@ -2501,8 +2501,7 @@ static void mts_dct_generic( tr_type_t type_hor; tr_type_t type_ver; - // ISP_TODO: height - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { @@ -2555,8 +2554,7 @@ static void mts_idct_generic( tr_type_t type_hor; tr_type_t type_ver; - // ISP_TODO: height - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { From 9e7f4eac990b913d1e1c884e2912dc9bebe13d3b Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Aug 2022 13:35:15 +0300 Subject: [PATCH 022/254] [isp] Change variable name 'type' to 'color'. 
--- src/rdo.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index dfb2f06f..2fdafc56 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1383,7 +1383,7 @@ void uvg_rdoq( coeff_t *dest_coeff, int32_t width, int32_t height, - int8_t type, + int8_t color, int8_t scan_mode, int8_t block_type, int8_t tr_depth, @@ -1392,6 +1392,7 @@ void uvg_rdoq( { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; + // ISP_TODO: these dimensions can be removed, they are same as log2_block_dimensions uint32_t log2_tr_width = uvg_math_floor_log2(width); uint32_t log2_tr_height = uvg_math_floor_log2(height); int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform @@ -1399,13 +1400,13 @@ void uvg_rdoq( uint32_t reg_bins = (width * height * 28) >> 4; const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; - int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + type; + int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + color; - int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; - const double lambda = type ? state->c_lambda : state->lambda; + const double lambda = color ? 
state->c_lambda : state->lambda; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; const double *err_scale = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; @@ -1423,7 +1424,7 @@ void uvg_rdoq( memset(dest_coeff, 0, sizeof(coeff_t) * width * height); // ISP_TODO: height - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); @@ -1444,8 +1445,6 @@ void uvg_rdoq( int32_t temp_diag = -1; int32_t temp_sum = -1; - - int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; @@ -1461,9 +1460,9 @@ void uvg_rdoq( default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); } - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[type ? 2 : 0]); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); - cabac_ctx_t* base_gt1_ctx = (type == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]); + cabac_ctx_t *baseCtx = (color == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); + cabac_ctx_t* base_gt1_ctx = (color == 0) ? 
&(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); struct { double coded_level_and_dist; @@ -1550,11 +1549,11 @@ void uvg_rdoq( uint16_t ctx_sig = 0; if (scanpos != last_scanpos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, type, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); } if (temp_diag != -1) { - ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((type == 0) ? 15 : 5) : (type == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); + ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((color == 0) ? 15 : 5) : (color == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); } else ctx_set = 0; @@ -1570,12 +1569,12 @@ void uvg_rdoq( if (scanpos == last_scanpos) { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, 0, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 1, type); + reg_bins, q_bits, temp, 1, color); } else { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, ctx_sig, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 0, type); + reg_bins, q_bits, temp, 0, color); if (encoder->cfg.signhide_enable) { int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1); int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0); @@ -1588,14 +1587,14 @@ void uvg_rdoq( if (encoder->cfg.signhide_enable) { sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8); if (level > 0) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; - sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level 
- 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; + sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { // level == 0 if (reg_bins < 4) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_gt1_ctx[gt1_ctx], 0); @@ -1695,12 +1694,12 @@ void uvg_rdoq( int8_t found_last = 0; int32_t best_last_idx_p1 = 0; - if( block_type != CU_INTRA && !type ) { + if( block_type != CU_INTRA && !color ) { best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0); base_cost += lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1); } else { cabac_ctx_t* base_cbf_model = NULL; - switch (type) { + switch (color) { case COLOR_Y: base_cbf_model = cabac->ctx.qt_cbf_model_luma; break; @@ -1714,12 +1713,12 @@ void uvg_rdoq( assert(0); } // ISP_TODO: does height affect ctx_cbf? Do this when fixing other cbf stuff - ctx_cbf = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); + ctx_cbf = ( color != COLOR_V ? 
0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); } - calc_last_bits(state, width, height, type, last_x_bits, last_y_bits); + calc_last_bits(state, width, height, color, last_x_bits, last_y_bits); for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { uint32_t cg_blkpos = scan_cg[cg_scanpos]; base_cost -= cost_coeffgroup_sig[cg_scanpos]; @@ -1768,7 +1767,7 @@ void uvg_rdoq( } if (encoder->cfg.signhide_enable && abs_sum >= 2) { - uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, type); + uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color); } } From 936256e750ac665743805e4b8eaf6730da670978 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Aug 2022 14:38:33 +0300 Subject: [PATCH 023/254] [isp] Fix sig coeff flag context calculation function call. Width & height was swapped. 
--- src/context.c | 2 +- src/context.h | 2 +- src/rdo.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/context.c b/src/context.c index 83bd5502..c83f19b3 100644 --- a/src/context.c +++ b/src/context.c @@ -656,7 +656,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, * \returns context index for current scan position */ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t type, int32_t* temp_diag, int32_t* temp_sum) { const coeff_t* data = coeff + pos_x + pos_y * width; diff --git a/src/context.h b/src/context.h index 366a438a..5155ebd3 100644 --- a/src/context.h +++ b/src/context.h @@ -52,7 +52,7 @@ void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * so uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width); uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width); uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t type, int32_t* temp_diag, int32_t* temp_sum); uint32_t uvg_context_get_sig_ctx_idx_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos_y, diff --git a/src/rdo.c b/src/rdo.c index 2fdafc56..fafc8a41 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1543,12 +1543,13 @@ void uvg_rdoq( if (last_scanpos >= 0) { uint32_t pos_y = blkpos >> log2_block_width; - uint32_t pos_x = blkpos - (pos_y << log2_block_width); // ISP_TODO: height + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== coefficient level estimation ===== int32_t level; uint16_t ctx_sig = 0; if (scanpos != last_scanpos) { + // VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here ctx_sig = 
uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); } From 31c8f1356f52e4a76a59ef8c48e9c92ebc20b540 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Aug 2022 15:28:30 +0300 Subject: [PATCH 024/254] [isp] Add height to sig coeff group context calculation function. --- src/context.c | 5 +++-- src/context.h | 2 +- src/rdo.c | 13 ++++++++----- src/strategies/generic/encode_coding_tree-generic.c | 7 ++++++- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/context.c b/src/context.c index c83f19b3..31124b02 100644 --- a/src/context.c +++ b/src/context.c @@ -618,13 +618,14 @@ void uvg_context_copy(encoder_state_t * const target_state, const encoder_state_ uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, - int32_t width) + int32_t width, + int32_t height) { uint32_t uiRight = 0; uint32_t uiLower = 0; uint32_t position = pos_y * width + pos_x; if (pos_x + 1 < (uint32_t)width) uiRight = sig_coeff_group_flag[position + 1]; - if (pos_y + 1 < (uint32_t)width) uiLower = sig_coeff_group_flag[position + width]; + if (pos_y + 1 < (uint32_t)height) uiLower = sig_coeff_group_flag[position + width]; return uiRight || uiLower; } diff --git a/src/context.h b/src/context.h index 5155ebd3..3f342409 100644 --- a/src/context.h +++ b/src/context.h @@ -49,7 +49,7 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice); void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * source_state); -uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width); +uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width, int32_t height); uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width); uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* 
coeff, uint32_t pos_x, uint32_t pos_y, uint32_t width, uint32_t height, int8_t type, diff --git a/src/rdo.c b/src/rdo.c index fafc8a41..4eab70c2 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1425,8 +1425,11 @@ void uvg_rdoq( // ISP_TODO: height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); const uint32_t *old_scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_width - 1 ]; const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; @@ -1636,7 +1639,7 @@ void uvg_rdoq( if( cg_scanpos ) { if (sig_coeffgroup_flag[cg_blkpos] == 0) { uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost; } else { @@ -1652,7 +1655,7 @@ void uvg_rdoq( // add SigCoeffGroupFlag cost to total cost ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1); base_cost += cost_coeffgroup_sig[cg_scanpos]; @@ -1713,7 +1716,7 @@ void uvg_rdoq( default: assert(0); } - // ISP_TODO: does height affect ctx_cbf? Do this when fixing other cbf stuff + // This cbf should work even with non-square blocks ctx_cbf = ( color != COLOR_V ? 
0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); @@ -1732,7 +1735,7 @@ void uvg_rdoq( if( dest_coeff[ blkpos ] ) { uint32_t pos_y = blkpos >> log2_block_width; - uint32_t pos_x = blkpos - ( pos_y << log2_block_width ); // ISP_TODO: height + uint32_t pos_x = blkpos - ( pos_y << log2_block_width ); double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index c1d2add9..7465ffa9 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -148,6 +148,11 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); + // !!! residual_coding_subblock() !!! 
// Encode significant coeff group flag when not the last or the first @@ -156,7 +161,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, } else { uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + cg_pos_y, cg_width, cg_height); CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "significant_coeffgroup_flag"); } From 50ad91a94e65bb4a689d64e55fb474b203a663b2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 15 Aug 2022 18:42:49 +0300 Subject: [PATCH 025/254] [isp] Modify quantization functions to work with non-square blocks. --- src/strategies/avx2/quant-avx2.c | 11 +++++++---- src/strategies/generic/quant-generic.c | 12 +++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 664933a8..125bfdf2 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -380,13 +380,15 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_tr_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, 
(encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform @@ -501,6 +503,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr __m256i v_coef, q_coefs; __m256i v_quant_coeff_lo, v_quant_coeff_hi; + // ISP_TODO: do these avx common functions need height? scanord_read_vector(coeffs, scan, scan_idx, subpos, width, result_coeffs, 2); v_coef = result_coeffs[0]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 81bf0892..3cfc3194 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -62,14 +62,16 @@ void uvg_quant_generic( uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - // ISP_TODO: width & height affect scan order - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_tr_height = uvg_g_convert_to_bit[height] + 2; + //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, 
encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform From 6922157ed33fcd6012f1f15ab111a9e4616cbc3e Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 16 Aug 2022 14:01:11 +0300 Subject: [PATCH 026/254] [isp] Fix quantization function calls. Some were not getting height as input. --- src/strategies/avx2/quant-avx2.c | 6 +++--- src/strategies/generic/quant-generic.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 125bfdf2..babb3d1f 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -685,15 +685,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 
1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, + uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); } else if (state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 3cfc3194..a02a5413 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -508,14 +508,14 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } // Check if there are any non-zero coefficients. { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; From 318d925028c178acc98b92ad9722a30f6178be4a Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 16 Aug 2022 15:00:15 +0300 Subject: [PATCH 027/254] [isp] Add new convert_to_log2 table. Change all instances which used old convert_to_bit table to change dimensions into log2. 
--- src/encode_coding_tree.c | 8 ++++---- src/intra.c | 16 ++++++++-------- src/rdo.c | 8 ++++---- src/search_intra.c | 8 ++++---- src/strategies/avx2/intra-avx2.c | 12 ++++++------ src/strategies/avx2/quant-avx2.c | 6 ++++-- .../generic/encode_coding_tree-generic.c | 4 ++-- src/strategies/generic/intra-generic.c | 12 ++++++------ src/strategies/generic/quant-generic.c | 14 +++++++------- src/tables.c | 2 ++ src/tables.h | 1 + src/transform.c | 6 +++--- 12 files changed, 51 insertions(+), 46 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 2469904f..2199f5bc 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -283,8 +283,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // CONSTANTS - const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_block_height = log2_block_width; // ISP_TODO: height + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = log2_block_width; // TODO: height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; @@ -1064,12 +1064,12 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uint32_t width = (LCU_WIDTH >> depth); uint32_t height = (LCU_WIDTH >> depth); - bool enough_samples = uvg_g_convert_to_bit[width] + uvg_g_convert_to_bit[height] > (uvg_g_convert_to_bit[4 /* MIN_TB_SIZEY*/] << 1); + // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Size cannot be 2 at this point. + bool allow_isp = !(width == 4 && height == 4); uint8_t isp_mode = 0; // ToDo: add height comparison //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 
1 : 0; //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0; - bool allow_isp = enough_samples; // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; diff --git a/src/intra.c b/src/intra.c index cab91005..ea848faa 100644 --- a/src/intra.c +++ b/src/intra.c @@ -922,8 +922,8 @@ static void intra_predict_regular( { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; const uvg_config *cfg = &state->encoder_control->cfg; // MRL only for luma @@ -988,8 +988,8 @@ void uvg_intra_build_reference_any( { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); @@ -1201,8 +1201,8 @@ void uvg_intra_build_reference_inner( { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); @@ -1487,8 +1487,8 @@ static void intra_recon_tb_leaf( const int width = color == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - int log2_width = uvg_g_convert_to_bit[width] + 2; - int log2_height = uvg_g_convert_to_bit[height] + 2; + int log2_width = uvg_g_convert_to_log2[width]; + int log2_height = uvg_g_convert_to_log2[height]; const int lcu_width = LCU_WIDTH >> shift; diff --git a/src/rdo.c b/src/rdo.c index 4eab70c2..fc4052c4 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1141,8 +1141,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const int max_log2_tr_dynamic_range = 15; uint32_t log2_tr_width = uvg_math_floor_log2(width); uint32_t log2_tr_height = uvg_math_floor_log2(height); - const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0]; const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1]; @@ -1398,8 +1398,8 @@ void uvg_rdoq( int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; - const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; int32_t scalinglist_type= (block_type == CU_INTRA ? 
0 : 3) + color; int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); diff --git a/src/search_intra.c b/src/search_intra.c index 5b348b0e..f36708b7 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -200,8 +200,8 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, // ToDo: large block support in VVC? uint32_t sig_coeffgroup_flag[32 * 32] = { 0 }; - const uint32_t log2_block_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_block_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; // ISP_TODO: height const uint32_t *old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_width - 1]; @@ -263,8 +263,8 @@ static INLINE bool can_use_isp(const int width, const int height, const int max_ assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; // Each split block must have at least 16 samples. bool not_enough_samples = (log2_width + log2_height <= 4); diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fc19654a..b25c626c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -64,8 +64,8 @@ static void uvg_angular_pred_avx2( // ISP_TODO: non-square block implementation, height is passed but not used const int width = channel_type == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); assert(intra_mode >= 2 && intra_mode <= 66); @@ -516,8 +516,8 @@ static void uvg_intra_pred_planar_avx2( // ISP_TODO: non-square block implementation, height is passed but not used const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); @@ -983,8 +983,8 @@ static void uvg_pdpc_planar_dc_avx2( assert(mode == 0 || mode == 1); // planar or DC const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; __m256i shuf_mask_byte = _mm256_setr_epi8( 0, -1, 0, -1, 0, -1, 0, -1, diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index babb3d1f..d7b09378 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -382,8 +382,8 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr const encoder_control_t * const encoder = state->encoder_control; //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; - const uint32_t log2_tr_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_tr_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); @@ -768,6 +768,8 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef const encoder_control_t * const encoder = state->encoder_control; int32_t shift,add,coeff_q; int32_t n; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 7465ffa9..189334b5 
100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -76,8 +76,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS const int height = width; // TODO: height for non-square blocks. - const uint32_t log2_block_width = uvg_g_convert_to_bit[width]+2; - const uint32_t log2_block_height = uvg_g_convert_to_bit[width] + 2; // ISP_TODO: height + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; const uint32_t *old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 4e050f79..14418f35 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -62,8 +62,8 @@ static void uvg_angular_pred_generic( { const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); assert(intra_mode >= 2 && intra_mode <= 66); @@ -430,8 +430,8 @@ static void uvg_intra_pred_planar_generic( { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; const int offset = 1 << (log2_width + log2_height); const int final_shift = 1 + log2_width + log2_height; @@ -538,8 +538,8 @@ static void uvg_pdpc_planar_dc_generic( assert(mode == 0 || mode == 1); // planar or DC const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_bit[width] + 2; - const int log2_height = uvg_g_convert_to_bit[height] + 2; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; const int scale = (log2_width + log2_height - 2) >> 2; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index a02a5413..6ad0412e 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -62,8 +62,8 @@ void uvg_quant_generic( uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_tr_width = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_tr_height = uvg_g_convert_to_bit[height] + 2; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); @@ -594,7 +594,9 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c const encoder_control_t * const encoder = state->encoder_control; int32_t shift,add,coeff_q; int32_t 
n; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); @@ -604,11 +606,9 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c if (encoder->scaling_list.enable) { - uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2; - uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6]; + const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; shift += 4; if (shift >qp_scaled / 6) { @@ -629,7 +629,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); - for (n = 0; n < width*height; n++) { + for (n = 0; n < width * height; n++) { coeff_q = (q_coef[n] * scale + add) >> shift; coef[n] = (coeff_t)CLIP(-32768, 32767, coeff_q); } diff --git a/src/tables.c b/src/tables.c index 0f5bfe31..0d51f2f4 100644 --- a/src/tables.c +++ b/src/tables.c @@ -7,6 +7,8 @@ #endif // 4 8 16 32 64 const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1] = {-1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4}; +// 0 1 2 4 8 16 32 64 +const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2] = //===== luma/chroma ===== diff --git a/src/tables.h b/src/tables.h index 66b4a2c0..0d52ea87 100644 --- a/src/tables.h +++ b/src/tables.h @@ -134,6 +134,7 @@ typedef enum */ extern const uint32_t* const uvg_g_sig_last_scan[3][5]; extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1]; +extern const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1]; extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; #define SCAN_GROUP_TYPES 2 diff --git a/src/transform.c b/src/transform.c index 4a1b4042..bf91b645 100644 --- a/src/transform.c +++ b/src/transform.c @@ -183,7 +183,7 @@ void uvg_derive_lfnst_constraints( coeff_scan_order_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); // ToDo: large block support in VVC? - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; const uint32_t* scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; signed scan_pos_last = -1; @@ -825,7 +825,7 @@ void uvg_fwd_lfnst( if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; assert(log2_block_size != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; const uint32_t* scan = whge3 ? 
uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; @@ -961,7 +961,7 @@ void uvg_inv_lfnst( const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; const bool whge3 = width >= 8 && height >= 8; const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; From 7c340fd92b321d2be95e68c5d794f0b3a526a11c Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 16 Aug 2022 15:52:30 +0300 Subject: [PATCH 028/254] [isp] Add height to inverse transform skip. --- src/strategies/avx2/quant-avx2.c | 6 ++---- src/strategies/generic/quant-generic.c | 4 ++-- src/transform.c | 12 ++++++------ src/transform.h | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index d7b09378..0a0df48c 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -716,7 +716,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); } if (use_trskip) { - uvg_itransformskip(state->encoder_control, residual, coeff, width); + uvg_itransformskip(state->encoder_control, residual, coeff, width, height); } else { uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); @@ -770,7 +770,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // 
Represents scaling through forward transform + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); @@ -780,8 +780,6 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef if (encoder->scaling_list.enable) { - uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2; - uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color); const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 6ad0412e..d08808cd 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -529,7 +529,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int y, x; // Get quantized residual. 
(coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, color, + uvg_dequant(state, coeff_out, coeff, width, height, color, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y); if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { @@ -537,7 +537,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); } if (use_trskip) { - uvg_itransformskip(state->encoder_control, residual, coeff, width); + uvg_itransformskip(state->encoder_control, residual, coeff, width, height); } else { uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); diff --git a/src/transform.c b/src/transform.c index bf91b645..10031f7b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -228,12 +228,12 @@ void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,i * \param block output data (residual) * \param block_size width of transform */ -void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_width, int8_t block_height) { int32_t j,k; - for ( j = 0; j < block_size; j++ ) { - for(k = 0; k < block_size; k ++) { - block[j * block_size + k] = coeff[j * block_size + k]; + for ( j = 0; j < block_height; j++ ) { + for(k = 0; k < block_width; k ++) { + block[j * block_width + k] = coeff[j * block_width + k]; } } } @@ -595,7 +595,7 @@ void uvg_chroma_transform_search( transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height); } if (transforms[i] != JCCR_1) { for (int j = 0; j < width * height; j++) { @@ -622,7 +622,7 @@ void uvg_chroma_transform_search( transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height); } for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]); diff --git a/src/transform.h b/src/transform.h index 0da34a12..f8f81da6 100644 --- a/src/transform.h +++ b/src/transform.h @@ -48,7 +48,7 @@ extern const int16_t uvg_g_inv_quant_scales[6]; extern const int16_t uvg_g_quant_scales[6]; void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); -void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); +void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, From bcbd952dfde72d910fd4c8f3908b4e282866a55e Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 16 Aug 2022 16:37:12 +0300 Subject: [PATCH 029/254] [isp] Add height handling to avx2 reconstruction. 
--- src/strategies/avx2/quant-avx2.c | 67 ++++++++++++++++++-------- src/strategies/generic/quant-generic.c | 6 +-- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 0a0df48c..078df533 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -582,33 +582,60 @@ static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec)); } -static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){ +static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width, int height){ - switch (width) { + if (height == width || width >= 16) { + switch (width) { case 4: - *(int32_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - *(int32_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int32_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int32_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int32_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int32_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int32_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int32_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); break; case 8: - *(int64_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - 
*(int64_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int64_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int64_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); - *(int64_t*)&(rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); - *(int64_t*)&(rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); - *(int64_t*)&(rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); - *(int64_t*)&(rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); + *(int64_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int64_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int64_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int64_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int64_t*)& (rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); + *(int64_t*)& (rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); + *(int64_t*)& (rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); + *(int64_t*)& (rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { - *(int64_t*)&(rec_out[x + y * out_stride]) = 
get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); - *(int64_t*)&(rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); + *(int64_t*)& (rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); + *(int64_t*)& (rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); } } break; + } + } + else { + switch (width) { + case 4: + for (int y = 0; y < height; y += 4) { + *(int32_t*)& (rec_out[(y + 0) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 0) * width, pred_in + (y + 0) * in_stride); + *(int32_t*)& (rec_out[(y + 1) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 1) * width, pred_in + (y + 1) * in_stride); + *(int32_t*)& (rec_out[(y + 2) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 2) * width, pred_in + (y + 2) * in_stride); + *(int32_t*)& (rec_out[(y + 3) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 3) * width, pred_in + (y + 3) * in_stride); + } + break; + case 8: + for (int y = 0; y < height; ++y) { + *(int32_t*)& (rec_out[y * out_stride]) = get_quantized_recon_8x1_avx2(residual + y * width, pred_in + y * in_stride); + } + break; + default: + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; + rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); + } + } + break; + } } } @@ -726,7 +753,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]); sign = residual[x + y * 
width] >= 0 ? 1 : -1; @@ -742,14 +769,14 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, } // Get quantized reconstruction. (residual + pred_in -> rec_out) - get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width); + get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width, height); } else if (rec_out != pred_in) { // With no coeffs and rec_out == pred_int we skip copying the coefficients // because the reconstruction is just the prediction. int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { rec_out[x + y * out_stride] = pred_in[x + y * in_stride]; } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index d08808cd..deb5c962 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -547,7 +547,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]); sign = residual[x + y * width] >= 0 ? 1 : -1; @@ -563,7 +563,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, } // Get quantized reconstruction. (residual + pred_in -> rec_out) - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); @@ -575,7 +575,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // because the reconstruction is just the prediction. 
int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { rec_out[x + y * out_stride] = pred_in[x + y * in_stride]; } From 573ecf80e3da01cfa2e4cc8bf11b769b1bce7daa Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Aug 2022 15:23:35 +0300 Subject: [PATCH 030/254] [isp] Move can_use_lfnst_with_isp to intra.c. Remove duplicate functions. Move isp related functions from search to intra. Make isp_split_dim static. Move isp related defines from search to intra. --- src/encode_coding_tree.c | 111 ++++++++++++--------------------------- src/intra.c | 92 +++++++++++++++++++++++++++++++- src/intra.h | 11 ++++ src/search.h | 8 --- src/search_intra.c | 75 +------------------------- 5 files changed, 139 insertions(+), 158 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 2199f5bc..8fb45396 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -102,63 +102,8 @@ static void encode_mts_idx(encoder_state_t * const state, } } -// ISP_TODO: move these defines to a proper place when ISP is implemented -// As of now, these are only needed in lfnst checks -#define NOT_INTRA_SUBPARTITIONS 0 -#define HOR_INTRA_SUBPARTITIONS 1 -#define VER_INTRA_SUBPARTITIONS 2 -#define NUM_INTRA_SUBPARTITIONS_MODES 3 -#define INTRA_SUBPARTITIONS_RESERVED 4 -#define TU_1D_HOR_SPLIT 8 -#define TU_1D_VER_SPLIT 9 -#define MIN_TB_SIZE_X 4 -#define MIN_TB_SIZE_Y 4 - -static int get_isp_split_dim(const int width, const int height, const int isp_split_type) -{ - bool divide_tu_in_rows = isp_split_type == TU_1D_HOR_SPLIT; - uint32_t split_dim_size, non_split_dim_size, partition_size, div_shift = 2; - - if (divide_tu_in_rows) - { - split_dim_size = height; - non_split_dim_size = width; - } - else - { - split_dim_size = width; - non_split_dim_size = height; - } - - const unsigned min_num_samples_cu = 1 << ((uvg_math_floor_log2(MIN_TB_SIZE_Y) << 1)); - const unsigned factor_to_min_samples = non_split_dim_size < 
min_num_samples_cu ? min_num_samples_cu >> uvg_math_floor_log2(non_split_dim_size) : 1; - partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); - - assert(!(uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples_cu)) && "Partition has less than minimum amount of samples."); - return partition_size; -} - -static bool can_use_lfnst_with_isp(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) -{ - if (tree_type == UVG_CHROMA_T) { - return false; - } - if (isp_split_type == NOT_INTRA_SUBPARTITIONS) { - return false; - } - - const int tu_width = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? width : get_isp_split_dim(width, height, TU_1D_VER_SPLIT); - const int tu_height = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? get_isp_split_dim(width, height, TU_1D_HOR_SPLIT) : height; - - if (!(tu_width >= MIN_TB_SIZE_Y && tu_height >= MIN_TB_SIZE_Y)) - { - return false; - } - return true; -} - - bool uvg_is_lfnst_allowed( +bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, const int width, @@ -170,8 +115,7 @@ static bool can_use_lfnst_with_isp(const int width, const int height, const int const lcu_t* lcu) { if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA) { - const int isp_mode = 0; // ISP_TODO: assign proper ISP mode when ISP is implemented - const int isp_split_type = 0; + const int isp_mode = pred_cu->intra.isp_mode; const int depth = pred_cu->depth; const int chroma_width = width >> 1; const int chroma_height = height >> 1; @@ -181,7 +125,7 @@ static bool can_use_lfnst_with_isp(const int width, const int height, const int bool is_sep_tree = depth == 4 || tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? 
pred_cu->intra.mip_flag : false; - if ((isp_mode && !can_use_lfnst_with_isp(width, height, isp_split_type, tree_type)) || + if ((isp_mode && !uvg_can_use_isp_with_lfnst(width, height, isp_mode, tree_type)) || (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || (cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH)) { @@ -1064,13 +1008,6 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uint32_t width = (LCU_WIDTH >> depth); uint32_t height = (LCU_WIDTH >> depth); - // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Size cannot be 2 at this point. - bool allow_isp = !(width == 4 && height == 4); - uint8_t isp_mode = 0; - // ToDo: add height comparison - //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 1 : 0; - //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0; - // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; int8_t mip_flag = enable_mip ? cur_cu->intra.mip_flag : false; @@ -1127,20 +1064,42 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } } + bool enable_isp = state->encoder_control->cfg.isp; + // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Blocks of size 2 do not exist yet (not for luma at least) + bool allow_isp = enable_isp ? uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) : false; + uint8_t isp_mode = allow_isp ? 
cur_cu->intra.isp_mode : 0; - // ToDo: update real usage, these if clauses as such don't make any sense - if (isp_mode != 0 && multi_ref_idx == 0) { - if (isp_mode) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); - } else { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); - // ToDo: complete this if-clause - if (isp_mode == 3) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); - } + // ToDo: add height comparison + //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 1 : 0; + //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0; + + if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { + if (isp_mode == ISP_MODE_NO_ISP) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subpartitions_mode"); + } + else { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subpartitions_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[1]), isp_mode - 1, bits, "intra_subpartitions_split_type"); // Vertical or horizontal split } } + + //if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { + // if (isp_mode == ISP_MODE_NO_ISP) { + // if (isp_mode) { + // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); + // } + // else { + // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); + // // ToDo: complete this if-clause + // if (isp_mode == 3) { + // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); + // } + // } + // } + //} + + const int cu_width = LCU_WIDTH >> depth; // PREDINFO CODING // If intra prediction mode is found from the predictors, diff --git a/src/intra.c b/src/intra.c index ea848faa..1a1e3817 100644 --- a/src/intra.c +++ 
b/src/intra.c
@@ -1471,6 +1471,41 @@ const cu_info_t* uvg_get_co_located_luma_cu(
 }
 
 
+/**
+* \brief Returns ISP split partition size based on block dimensions and split type.
+*
+* Returns ISP split partition size based on block dimensions and split type.
+* Will fail if resulting partition size has less than 16 samples.
+*
+* \param width Block width.
+* \param height Block height.
+* \param split_type Horizontal or vertical split.
+*/
+static int get_isp_split_dim(const int width, const int height, const int split_type)
+{
+  bool divide_in_rows = split_type == SPLIT_TYPE_HOR;
+  int split_dim_size, non_split_dim_size, partition_size, div_shift = 2;
+
+  if (divide_in_rows) {
+    split_dim_size = height;
+    non_split_dim_size = width;
+  }
+  else {
+    split_dim_size = width;
+    non_split_dim_size = height;
+  }
+
+  // ISP_TODO: make a define for this. Depends on minimum transform block log2 side length
+  const int min_num_samples = 16; // Minimum allowed number of samples for split block
+  const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1;
+  partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift);
+
+  assert(!(uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples)) &&
+    "Partition has less than allowed minimum number of samples.");
+  return partition_size;
+}
+
+
 static void intra_recon_tb_leaf(
   encoder_state_t* const state,
   const cu_loc_t* cu_loc,
@@ -1631,7 +1666,7 @@ void uvg_intra_recon_cu(
   // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions.
   // Small blocks are split only twice.
int split_type = search_data->pred_cu.intra.isp_mode; - int part_dim = uvg_get_isp_split_dim(width, height, split_type); + int part_dim = get_isp_split_dim(width, height, split_type); int limit = split_type == ISP_MODE_HOR ? height : width; for (int part = 0; part < limit; part + part_dim) { const int part_x = split_type == ISP_MODE_HOR ? x : x + part; @@ -1668,3 +1703,58 @@ void uvg_intra_recon_cu( &loc, depth, cur_cu, lcu, false, tree_type); } + + +/** +* \brief Check if ISP can be used for block size. +* +* \return True if isp can be used. +* \param width Block width. +* \param height Block height. +* \param max_tr_size Maximum supported transform block size (64). +*/ +bool uvg_can_use_isp(const int width, const int height, const int max_tr_size) +{ + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); + assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); + + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // Each split block must have at least 16 samples. + bool not_enough_samples = (log2_width + log2_height <= 4); + bool cu_size_larger_than_max_tr_size = width > max_tr_size || height > max_tr_size; + if (not_enough_samples || cu_size_larger_than_max_tr_size) { + return false; + } + return true; +} + + +/** +* \brief Check if given ISP mode can be used with LFNST. +* +* \return True if isp can be used. +* \param width Block width. +* \param height Block height. +* \param isp_mode ISP mode. +* \param tree_type Tree type. Dual, luma or chroma tree. +*/ +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) +{ + if (tree_type == UVG_CHROMA_T) { + return false; + } + if (isp_split_type == ISP_MODE_NO_ISP) { + return false; + } + + const int tu_width = (isp_split_type == ISP_MODE_HOR) ? 
width : get_isp_split_dim(width, height, SPLIT_TYPE_VER); + const int tu_height = (isp_split_type == ISP_MODE_HOR) ? get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; + + if (!(tu_width >= TR_MIN_WIDTH && tu_height >= TR_MIN_WIDTH)) + { + return false; + } + return true; +} diff --git a/src/intra.h b/src/intra.h index 6c7a648e..75b969b3 100644 --- a/src/intra.h +++ b/src/intra.h @@ -159,3 +159,14 @@ const cu_info_t* uvg_get_co_located_luma_cu( enum uvg_tree_type tree_type); int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); + +// ISP related defines +#define NUM_ISP_MODES 3 +#define ISP_MODE_NO_ISP 0 +#define ISP_MODE_HOR 1 +#define ISP_MODE_VER 2 +#define SPLIT_TYPE_HOR 1 +#define SPLIT_TYPE_VER 2 + +bool uvg_can_use_isp(const int width, const int height, const int max_tr_size); +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/search.h b/src/search.h index 2a5a6867..7566fb96 100644 --- a/src/search.h +++ b/src/search.h @@ -77,14 +77,6 @@ typedef struct unit_stats_map_t { #define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 
16 : 12)) #define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1) -// ISP related defines -#define NUM_ISP_MODES 3 -#define ISP_MODE_NO_ISP 0 -#define ISP_MODE_HOR 1 -#define ISP_MODE_VER 2 -#define SPLIT_TYPE_HOR 1 -#define SPLIT_TYPE_VER 2 - void uvg_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length); diff --git a/src/search_intra.c b/src/search_intra.c index f36708b7..71964022 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -257,77 +257,6 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, } -// ISP_TODO: move this function if it is used elsewhere -static INLINE bool can_use_isp(const int width, const int height, const int max_tr_size) -{ - assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); - assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); - - const int log2_width = uvg_g_convert_to_log2[width]; - const int log2_height = uvg_g_convert_to_log2[height]; - - // Each split block must have at least 16 samples. - bool not_enough_samples = (log2_width + log2_height <= 4); - bool cu_size_larger_than_max_tr_size = width > max_tr_size || height > max_tr_size; - if (not_enough_samples || cu_size_larger_than_max_tr_size) { - return false; - } - return true; -} - - -/** -* \brief Returns ISP split partition size based on block dimensions and split type. -* -* Returns ISP split partition size based on block dimensions and split type. -* Will fail if resulting partition size has less than 16 samples. -* -* \param width Block width. -* \param height Block height. -* \param split_type Horizontal or vertical split. 
-*/ -int uvg_get_isp_split_dim(const int width, const int height, const int split_type) -{ - bool divide_in_rows = split_type == SPLIT_TYPE_HOR; - int split_dim_size, non_split_dim_size, partition_size, div_shift = 2; - - if (divide_in_rows) { - split_dim_size = height; - non_split_dim_size = width; - } - else { - split_dim_size = width; - non_split_dim_size = height; - } - - // ISP_TODO: make a define for this. Depends on minimum transform block log2 side length - const int min_num_samples = 16; // Minimum allowed number of samples for split block - const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; - partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); - - assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples)) && - "Partition has less than allowed minimum number of samples."); - return partition_size; -} - - -// ISP_TODO: move this function if it is used elsewhere -static INLINE bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode) -{ - if (isp_mode == ISP_MODE_NO_ISP) { - return false; - } - const int tu_width = isp_mode == ISP_MODE_HOR ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER); - const int tu_height = isp_mode == ISP_MODE_HOR ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; - const int min_tb_size = TR_MIN_WIDTH; - - if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) { - return false; - } - return true; -} - - /** * \brief Perform search for best intra transform split configuration. * @@ -445,7 +374,7 @@ static double search_intra_trdepth( if (pred_cu->lfnst_idx != 0) { // Cannot use ISP with LFNST for small blocks - pred_cu->intra.isp_mode = can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode) ? 
pred_cu->intra.isp_mode : ISP_MODE_NO_ISP; + pred_cu->intra.isp_mode = uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? pred_cu->intra.isp_mode : ISP_MODE_NO_ISP; } for (trafo = mts_start; trafo < num_transforms; trafo++) { @@ -1465,7 +1394,7 @@ static int8_t search_intra_rdo( for (int mode = 0; mode < modes_to_check; mode++) { bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false: true; // Cannot use ISP with MIP can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL - int max_isp_modes = can_do_isp_search && can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; + int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { search_data[mode].pred_cu.intra.isp_mode = isp_mode; From 8d914ce849a87265d4a1a168e5a32d055d2b3dd7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Aug 2022 16:42:22 +0300 Subject: [PATCH 031/254] [isp] Implement coefficient encoding for isp splits. Make get_split_dim non static, it was needed elsewhere after all. 
--- src/encode_coding_tree.c | 47 ++++++++++++++++++++++++++++------------ src/intra.c | 12 +++++----- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 8fb45396..019c1d03 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -525,7 +525,8 @@ static void encode_transform_unit( int depth, bool only_chroma, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + bool last_split) { assert(depth >= 1 && depth <= MAX_PU_DEPTH); @@ -586,7 +587,7 @@ static void encode_transform_unit( bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) || cbf_is_set(cur_pu->cbf, depth, COLOR_V); - if (chroma_cbf_set || joint_chroma) { + if ((chroma_cbf_set || joint_chroma) && last_split) { //Need to drop const to get lfnst constraints encode_chroma_tu(state, x, y, depth, width_c, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); } @@ -611,7 +612,8 @@ static void encode_transform_coeff( uint8_t parent_coeff_v, bool only_chroma, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + bool last_split) // Always true except when writing sub partition coeffs (ISP) { cabac_data_t * const cabac = &state->cabac; //const encoder_control_t *const ctrl = state->encoder_control; @@ -642,8 +644,6 @@ static void encode_transform_coeff( int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH); - - const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0; const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0; const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? 
cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V)) : 0; @@ -671,7 +671,7 @@ static void encode_transform_coeff( // - they have already been signaled to 0 previously // When they are not present they are inferred to be 0, except for size 4 // when the flags from previous level are used. - if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T) { + if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T && last_split) { if (!split) { if (true) { @@ -690,10 +690,10 @@ static void encode_transform_coeff( uint8_t offset = LCU_WIDTH >> (depth + 1); int x2 = x + offset; int y2 = y + offset; - encode_transform_coeff(state, x, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); + encode_transform_coeff(state, x, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + encode_transform_coeff(state, x2, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + encode_transform_coeff(state, x, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); return; } @@ -737,12 +737,13 @@ static void encode_transform_coeff( || (cb_flag_u && cb_flag_v)) && (depth != 4 || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr + && last_split ) { assert(cur_pu->joint_cb_cr < 4 && "JointCbCr is in search state."); cabac->cur_ctx = 
&cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, x, y, depth, only_chroma, coeff, tree_type); + encode_transform_unit(state, x, y, depth, only_chroma, coeff, tree_type, last_split); } } @@ -1611,7 +1612,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (cbf) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type, true); } encode_mts_idx(state, cabac, cur_cu); @@ -1628,7 +1629,25 @@ void uvg_encode_coding_tree( } if (tree_type != UVG_CHROMA_T) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); + // Cycle through sub partitions if ISP enabled. + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = cur_cu->intra.isp_mode; + + int part_dim = cu_width; + if (split_type != ISP_MODE_NO_ISP) { + part_dim = uvg_get_isp_split_dim(cu_width, cu_height, split_type); + } + int limit = split_type == ISP_MODE_HOR ? cu_height : cu_width; + + for (int part = 0; part < limit; part += part_dim) { + const int part_x = split_type == ISP_MODE_HOR ? x : x + part; + const int part_y = split_type == ISP_MODE_HOR ? 
y + part : y; + + // Check if last split to write chroma + bool last_split = (part + part_dim) == limit; + encode_transform_coeff(state, part_x, part_y, depth, 0, 0, 0, 0, coeff, tree_type, last_split); + } } if (tree_type != UVG_CHROMA_T) { @@ -1646,7 +1665,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff, tree_type); + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff, tree_type, true); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } diff --git a/src/intra.c b/src/intra.c index 1a1e3817..e65acfea 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1481,8 +1481,10 @@ const cu_info_t* uvg_get_co_located_luma_cu( * \param height Block height. * \param split_type Horizontal or vertical split. */ -static int get_isp_split_dim(const int width, const int height, const int split_type) +int uvg_get_isp_split_dim(const int width, const int height, const int split_type) { + assert(split_type != ISP_MODE_NO_ISP && "Cannot calculate split dimension if no split type is set. Make sure this function is not called in this case."); + bool divide_in_rows = split_type == SPLIT_TYPE_HOR; int split_dim_size, non_split_dim_size, partition_size, div_shift = 2; @@ -1666,9 +1668,9 @@ void uvg_intra_recon_cu( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. int split_type = search_data->pred_cu.intra.isp_mode; - int part_dim = get_isp_split_dim(width, height, split_type); + int part_dim = uvg_get_isp_split_dim(width, height, split_type); int limit = split_type == ISP_MODE_HOR ? 
height : width; - for (int part = 0; part < limit; part + part_dim) { + for (int part = 0; part < limit; part += part_dim) { const int part_x = split_type == ISP_MODE_HOR ? x : x + part; const int part_y = split_type == ISP_MODE_HOR ? y + part: y; const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; @@ -1749,8 +1751,8 @@ bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp return false; } - const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : get_isp_split_dim(width, height, SPLIT_TYPE_VER); - const int tu_height = (isp_split_type == ISP_MODE_HOR) ? get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; + const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER); + const int tu_height = (isp_split_type == ISP_MODE_HOR) ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; if (!(tu_width >= TR_MIN_WIDTH && tu_height >= TR_MIN_WIDTH)) { From 75175ee2e2cee6b3ae1f0f231c2799423b3660ad Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 13:33:29 +0300 Subject: [PATCH 032/254] [isp] Fix isp search. --- src/search_intra.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index 71964022..04b700f8 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1392,8 +1392,11 @@ static int8_t search_intra_rdo( const int height = width; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { - bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false: true; // Cannot use ISP with MIP + bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false : true; // Cannot use ISP with MIP can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? 
can_do_isp_search : false; // Cannot use ISP with MRL + double best_isp_cost = MAX_DOUBLE; + double best_bits = MAX_DOUBLE; + int8_t best_isp_mode = -1; int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { @@ -1405,11 +1408,19 @@ static int8_t search_intra_rdo( double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); search_data[mode].cost += mode_cost; + if (search_data[mode].cost < best_isp_cost) { + best_isp_cost = search_data[mode].cost; + best_isp_mode = isp_mode; + best_bits = search_data[mode].bits; + } if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { modes_to_check = mode + 1; break; } } + search_data[mode].cost = best_isp_cost; + search_data[mode].bits = best_bits; + search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; } // Update order according to new costs From 031a758d6c2f60563f52bffa79e77e7ce9881ab7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 14:03:53 +0300 Subject: [PATCH 033/254] [isp] Count isp cbfs. --- src/intra.c | 6 ++++ src/intra.h | 2 ++ src/search.c | 85 ++++++++++++++++++++++++++++++---------------- src/search.h | 3 +- src/search_inter.c | 2 +- src/search_intra.c | 4 ++- src/transform.c | 1 - 7 files changed, 69 insertions(+), 34 deletions(-) diff --git a/src/intra.c b/src/intra.c index e65acfea..5712b40a 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1665,12 +1665,15 @@ void uvg_intra_recon_cu( return; } if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { + search_data->best_isp_cbfs = 0; // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. 
int split_type = search_data->pred_cu.intra.isp_mode; int part_dim = uvg_get_isp_split_dim(width, height, split_type); int limit = split_type == ISP_MODE_HOR ? height : width; + int split_num = 0; for (int part = 0; part < limit; part += part_dim) { + cbf_clear(&cur_cu->cbf, depth, COLOR_Y); const int part_x = split_type == ISP_MODE_HOR ? x : x + part; const int part_y = split_type == ISP_MODE_HOR ? y + part: y; const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; @@ -1683,6 +1686,8 @@ void uvg_intra_recon_cu( uvg_quantize_lcu_residual(state, true, false, false, &loc, depth, cur_cu, lcu, false, tree_type); + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (split_num++); + } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; @@ -1700,6 +1705,7 @@ void uvg_intra_recon_cu( intra_recon_tb_leaf(state, &loc, lcu, COLOR_V, search_data, tree_type); } + // TODO: not necessary to call if only luma and ISP is on uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, &loc, depth, cur_cu, lcu, diff --git a/src/intra.h b/src/intra.h index 75b969b3..6fee8f1f 100644 --- a/src/intra.h +++ b/src/intra.h @@ -71,6 +71,7 @@ typedef struct { double coeff_bits; double distortion; double lfnst_costs[3]; + uint8_t best_isp_cbfs; } intra_search_data_t ; @@ -168,5 +169,6 @@ int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* l #define SPLIT_TYPE_HOR 1 #define SPLIT_TYPE_VER 2 +int uvg_get_isp_split_dim(const int width, const int height, const int split_type); bool uvg_can_use_isp(const int width, const int height, const int max_tr_size); bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/search.c b/src/search.c index 56e07b06..64dd263b 100644 --- a/src/search.c +++ 
b/src/search.c @@ -307,7 +307,8 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, double uvg_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + uint8_t isp_cbf) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); @@ -329,29 +330,40 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, int offset = width / 2; double sum = 0; - sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, isp_cbf); + sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, isp_cbf); + sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf); + sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf); return sum + tr_tree_bits * state->lambda; } // Add transform_tree cbf_luma bit cost. 
- const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; - int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (pred_cu->type == CU_INTRA || + if (pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; + int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + if (pred_cu->type == CU_INTRA || is_tr_split || cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) - { - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + } + + if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + } } - - if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + else { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + // TODO: 8x4 CUs + for (int i = 0; i < 4; i++) { + if (i != 3 && isp_cbf != 0x8) { + CABAC_FBITS_UPDATE(cabac, ctx, (isp_cbf >> i) & 1, tr_tree_bits, "cbf_y_search"); + } + } } // SSD between reconstruction and original @@ -478,7 +490,8 @@ static double cu_rd_cost_tr_split_accurate( const int depth, const cu_info_t* const pred_cu, lcu_t* const lcu, - enum uvg_tree_type tree_type) { + enum uvg_tree_type tree_type, + uint8_t isp_cbf) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); @@ -523,25 +536,37 @@ static double cu_rd_cost_tr_split_accurate( int offset = LCU_WIDTH >> (depth + 1); 
double sum = 0; - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf); return sum + tr_tree_bits * state->lambda; } const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; + const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. 
-  const int is_tr_split = depth - tr_cu->depth;
-  if ((pred_cu->type == CU_INTRA ||
-    is_tr_split ||
-    cb_flag_u ||
-    cb_flag_v)
+  if (!is_isp) {
+    const int is_tr_split = depth - tr_cu->depth;
+    if ((pred_cu->type == CU_INTRA ||
+      is_tr_split ||
+      cb_flag_u ||
+      cb_flag_v)
       && !skip_residual_coding && tree_type != UVG_CHROMA_T)
-  {
-    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    {
+      cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
 
-    CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+      CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+    }
+  }
+  else {
+    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    // TODO: 8x4 CUs
+    for (int i = 0; i < 4; i++) {
+      if (i != 3 && isp_cbf != 0x8) {
+        CABAC_FBITS_UPDATE(cabac, ctx, (isp_cbf >> i) & 1, tr_tree_bits, "cbf_y_search");
+      }
+    }
   }
 
   if (cb_flag_y || cb_flag_u || cb_flag_v) {
@@ -563,7 +588,7 @@ static double cu_rd_cost_tr_split_accurate(
   }
 
   // Chroma transform skip enable/disable is non-normative, so we need to count the chroma
   // tr-skip bits even when we are never using it. 
- const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && !is_isp; if(cb_flag_y){ if (can_use_tr_skip) { @@ -1144,7 +1169,7 @@ static double search_cu( cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0); if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -1301,7 +1326,7 @@ static double search_cu( double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; cost += mode_bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); diff --git a/src/search.h b/src/search.h index 7566fb96..1a013670 100644 --- a/src/search.h +++ b/src/search.h @@ -87,7 +87,8 @@ void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, double uvg_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu); + lcu_t *const lcu, + uint8_t isp_cbf); double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, cu_info_t *const pred_cu, diff --git a/src/search_inter.c b/src/search_inter.c index ff511740..e2013c6a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2286,7 +2286,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, int cbf = cbf_is_set_any(cur_cu->cbf, depth); 
if(cbf) { - *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); + *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, 0); if (reconstruct_chroma) { if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); diff --git a/src/search_intra.c b/src/search_intra.c index 04b700f8..900f9113 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -406,6 +406,7 @@ static double search_intra_trdepth( UVG_LUMA_T, true, false); + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; @@ -445,7 +446,8 @@ static double search_intra_trdepth( lcu_px.y, depth, pred_cu, - lcu); + lcu, + search_data->best_isp_cbfs); double transform_bits = 0; if (state->encoder_control->cfg.lfnst && depth == pred_cu->tr_depth && trafo != MTS_SKIP) { diff --git a/src/transform.c b/src/transform.c index 10031f7b..01f6289f 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1269,7 +1269,6 @@ static void quantize_tr_residual( } - // ISP_TODO: when other ISP things work, ask Joose about this cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { cbf_set(&cur_pu->cbf, depth, color); From ae0336fdfc31511680af371960ae7841f68ffaba Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 15:07:22 +0300 Subject: [PATCH 034/254] [isp] Add non-square block handling to functions. 
--- src/context.c | 10 +++---- src/context.h | 2 +- src/encode_coding_tree.c | 28 +++++++++++-------- src/encode_coding_tree.h | 1 + src/rdo.c | 12 +++++--- src/rdo.h | 1 + src/search.c | 26 +++++++++-------- src/strategies/avx2/encode_coding_tree-avx2.h | 1 + src/strategies/avx2/quant-avx2.c | 5 ++-- .../generic/encode_coding_tree-generic.c | 10 +++---- .../generic/encode_coding_tree-generic.h | 1 + src/strategies/generic/quant-generic.c | 5 ++-- src/strategies/strategies-encode.h | 1 + src/strategies/strategies-quant.h | 2 +- src/transform.c | 2 ++ 15 files changed, 65 insertions(+), 42 deletions(-) diff --git a/src/context.c b/src/context.c index 31124b02..708b9da4 100644 --- a/src/context.c +++ b/src/context.c @@ -657,7 +657,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, * \returns context index for current scan position */ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t width, uint32_t height, int8_t type, + uint32_t width, uint32_t height, int8_t color, int32_t* temp_diag, int32_t* temp_sum) { const coeff_t* data = coeff + pos_x + pos_y * width; @@ -687,7 +687,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u } #undef UPDATE int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0); - if (type == 0 /* Luma */) + if (color == COLOR_Y) { ctx_ofs += diag < 5 ? 
4 : 0; } @@ -815,7 +815,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos * \returns context go rice parameter */ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/ @@ -857,8 +857,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, * \returns context go rice parameter */ uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { - uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel); + uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel); return g_go_rice_pars[check]; } \ No newline at end of file diff --git a/src/context.h b/src/context.h index 3f342409..f083e44c 100644 --- a/src/context.h +++ b/src/context.h @@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, uint32_t height, uint32_t width, uint32_t baselevel); uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel); + uint32_t width, uint32_t height, uint32_t baselevel); #define CNU 35 #define DWS 8 diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 019c1d03..f917b31d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -213,6 +213,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, double* bits_out) @@ -227,8 +228,9 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // CONSTANTS - const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; - const uint32_t log2_block_height = log2_block_width; // 
TODO: height + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + // TODO: log2_cg_size is wrong if width != height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; @@ -243,13 +245,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac->cur_ctx = base_coeff_group_ctx; - // ISP_TODO: height - int maxCtxBins = (width * width * 7) >> 2; + int maxCtxBins = (width * height * 7) >> 2; unsigned scan_cg_last = (unsigned )-1; //unsigned scan_pos_last = (unsigned )-1; - // ISP_TODO: height - for (i = 0; i < width * width; i++) { + for (i = 0; i < width * height; i++) { if (coeff[scan[i]]) { // ISP_DEBUG assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); @@ -258,7 +258,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state, sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } - scan_cg_last = (width * width - 1) >> log2_cg_size; + // TODO: this won't work with non-square blocks + scan_cg_last = (width * height - 1) >> log2_cg_size; const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); bool no_sig_group_before_last = true; @@ -481,6 +482,7 @@ static void encode_chroma_tu( enum uvg_tree_type tree_type) { + int height_c = width_c; // TODO: height for non-square blocks int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; @@ -496,7 +498,7 @@ static void encode_chroma_tu( // TODO: transform skip for chroma blocks CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, 
coeff_u, width_c, COLOR_U, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, height_c, COLOR_U, *scan_idx, cur_pu, NULL); } if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { @@ -504,7 +506,7 @@ static void encode_chroma_tu( cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL); } } else { @@ -513,7 +515,7 @@ static void encode_chroma_tu( cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL); } } @@ -534,6 +536,9 @@ static void encode_transform_unit( cabac_data_t* const cabac = &state->cabac; const uint8_t width = LCU_WIDTH >> depth; const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); + // TODO: height for non-square blocks + const uint8_t height = width; + const uint8_t height_c = width_c; cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y); @@ -556,13 +561,14 @@ static void encode_transform_unit( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 
1 : 0); } if(cur_pu->tr_idx == MTS_SKIP) { - uvg_encode_ts_residual(state, cabac, coeff_y, width, 0, scan_idx, NULL); + uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL); } else { uvg_encode_coeff_nxn(state, cabac, coeff_y, width, + height, 0, scan_idx, (cu_info_t * )cur_pu, diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index c2cd39da..9757a327 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -64,6 +64,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, double* bits); diff --git a/src/rdo.c b/src/rdo.c index fc4052c4..9f5abd21 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -298,6 +298,7 @@ static INLINE double get_coeff_cabac_cost( const encoder_state_t * const state, const coeff_t *coeff, int32_t width, + int32_t height, color_t color, int8_t scan_mode, int8_t tr_skip, @@ -305,7 +306,7 @@ static INLINE double get_coeff_cabac_cost( { // Make sure there are coeffs present bool found = false; - for (int i = 0; i < width*width; i++) { + for (int i = 0; i < width * height; i++) { if (coeff[i] != 0) { found = 1; break; @@ -331,6 +332,7 @@ static INLINE double get_coeff_cabac_cost( &cabac_copy, coeff, width, + height, color, scan_mode, cur_tu, @@ -341,6 +343,7 @@ static INLINE double get_coeff_cabac_cost( &cabac_copy, coeff, width, + height, color, scan_mode, &bits); @@ -392,6 +395,7 @@ double uvg_get_coeff_cost( const coeff_t *coeff, cu_info_t* cur_tu, int32_t width, + int32_t height, color_t color, int8_t scan_mode, int8_t tr_skip) @@ -409,15 +413,15 @@ double uvg_get_coeff_cost( return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0) } else { uint64_t weights = uvg_fast_coeff_get_weights(state); - uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights); + uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, height, 
weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { save_ccc(state->qp, coeff, width * width, ccc); } diff --git a/src/rdo.h b/src/rdo.h index 7f325cfd..88a6548b 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -74,6 +74,7 @@ double uvg_get_coeff_cost( const coeff_t *coeff, cu_info_t* cur_tu, int32_t width, + int32_t height, color_t color, int8_t scan_mode, int8_t tr_skip); diff --git a/src/search.c b/src/search.c index 64dd263b..ba2f79c9 100644 --- a/src/search.c +++ b/src/search.c @@ -310,7 +310,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, lcu_t *const lcu, uint8_t isp_cbf) { - const int width = LCU_WIDTH >> depth; + const int width = LCU_WIDTH >> depth; + const int height = width; // TODO: height for non-square blocks const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; @@ -380,7 +381,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } double bits = tr_tree_bits + coeff_bits; @@ -394,7 +395,8 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, lcu_t *const lcu) { const 
vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; - const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + const int height = width; // TODO: height for non-square blocks cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); @@ -468,11 +470,11 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, 2, scan_order, 0); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, height, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, height, 2, scan_order, 0); } else { - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, height, 2, scan_order, 0); } } @@ -493,6 +495,7 @@ static double cu_rd_cost_tr_split_accurate( enum uvg_tree_type tree_type, uint8_t isp_cbf) { const int width = LCU_WIDTH >> depth; + const int height = width; // TODO: height for non-square blocks const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
@@ -597,7 +600,7 @@ static double cu_rd_cost_tr_split_accurate( int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, tr_cu->tr_skip & 1); } if(depth == 4 || tree_type == UVG_LUMA_T) { @@ -624,7 +627,8 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 }; - const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); + const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); + const int chroma_height = chroma_width; // TODO: height for non-square blocks int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); @@ -646,8 +650,8 @@ static double cu_rd_cost_tr_split_accurate( if(chroma_can_use_tr_skip && cb_flag_v) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, tr_cu->tr_skip & 2); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, chroma_height, COLOR_V, scan_order, tr_cu->tr_skip & 4); } else { @@ -664,7 +668,7 @@ static double cu_rd_cost_tr_split_accurate( if (chroma_can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, 
&cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, COLOR_U, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, 0); } } diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h index ae1845c8..9fc75c8a 100644 --- a/src/strategies/avx2/encode_coding_tree-avx2.h +++ b/src/strategies/avx2/encode_coding_tree-avx2.h @@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, uint8_t width, + uint8_t height, uint8_t type, int8_t scan_mode, int8_t tr_skip, diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 078df533..962a671a 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -875,8 +875,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length) return parts[0] + parts[1] + parts[2] + parts[3]; } -static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); const __m256i zero = _mm256_setzero_si256(); const __m256i threes = _mm256_set1_epi16(3); const __m256i negate_hibytes = _mm256_set1_epi16(0xff00); @@ -893,7 +894,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64 __m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128); __m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128); - for (int i = 0; i < width * width; i += 32) { + for (int i = 0; i < width * height; i += 32) { __m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i)); __m256i curr_abs_lo 
= _mm256_abs_epi16 (curr_lo); __m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes); diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 189334b5..21785501 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -55,6 +55,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, uint8_t width, + uint8_t height, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, @@ -75,7 +76,6 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS - const int height = width; // TODO: height for non-square blocks. const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; @@ -192,7 +192,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, sig = (coeff[blk_pos] != 0) ? 
1 : 0; if (num_non_zero || next_sig_pos != infer_sig_pos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]); cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]); @@ -200,7 +200,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, reg_bins--; } else if (next_sig_pos != scan_pos_last) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); } @@ -266,7 +266,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, blk_pos = scan[scan_pos]; pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4); rice_param = g_go_rice_pars[abs_sum]; uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]); @@ -284,7 +284,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); uint32_t coeff_abs = abs(coeff[blk_pos]); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0); rice_param = g_go_rice_pars[abs_sum]; pos0 = ((quant_state<2)?1:2) << rice_param; uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? 
coeff_abs - 1 : coeff_abs); diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index 8cfe497d..bcf51f15 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, uint8_t width, + uint8_t height, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index deb5c962..16fbce38 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -653,14 +653,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights) weights[3] = (wts_packed >> 48) & 0xffff; } -static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); uint32_t sum = 0; uint16_t weights_unpacked[4]; get_coeff_weights(weights, weights_unpacked); - for (int32_t i = 0; i < width * width; i++) { + for (int32_t i = 0; i < width * height; i++) { int16_t curr = coeff[i]; uint32_t curr_abs = abs(curr); if (curr_abs > 3) { diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index 8743a6ed..f503eb73 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -50,6 +50,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, uint8_t width, + uint8_t heigth, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h index 2920ed82..b0e75046 100644 --- 
a/src/strategies/strategies-quant.h +++ b/src/strategies/strategies-quant.h @@ -86,7 +86,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state, typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, color_t color, int8_t block_type, int8_t transform_skip); -typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights); +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); diff --git a/src/transform.c b/src/transform.c index 01f6289f..4738f942 100644 --- a/src/transform.c +++ b/src/transform.c @@ -690,6 +690,7 @@ void uvg_chroma_transform_search( u_quant_coeff, pred_cu, width, + height, COLOR_U, scan_order, transforms[i] == CHROMA_TS); @@ -706,6 +707,7 @@ void uvg_chroma_transform_search( v_quant_coeff, pred_cu, width, + height, COLOR_V, scan_order, transforms[i] == CHROMA_TS); From f8641f74366fc371bdbb3719cd23ab8f9336e6dd Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 15:22:17 +0300 Subject: [PATCH 035/254] [isp] Fix assert. Implement coef cost calculation for isp splits. --- src/intra.c | 2 +- src/search.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/intra.c b/src/intra.c index 5712b40a..a638661c 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1502,7 +1502,7 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? 
factor_to_min_samples : (split_dim_size >> div_shift); - assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples)) && + assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) >= uvg_math_floor_log2(min_num_samples)) && "Partition has less than allowed minimum number of samples."); return partition_size; } diff --git a/src/search.c b/src/search.c index ba2f79c9..f9b58439 100644 --- a/src/search.c +++ b/src/search.c @@ -340,7 +340,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } // Add transform_tree cbf_luma bit cost. - if (pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); if (pred_cu->type == CU_INTRA || @@ -379,9 +379,27 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, if (!skip_residual_coding) { int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + } + else { + int split_type = pred_cu->intra.isp_mode; + int part_dim = uvg_get_isp_split_dim(width, height, split_type); + int limit = split_type == ISP_MODE_HOR ? height : width; + int split_num = 0; + for (int part = 0; part < limit; part += part_dim) { + const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; + const int part_y = split_type == ISP_MODE_HOR ? 
y_px + part : y_px; + const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; + const int part_h = split_type == ISP_MODE_HOR ? height : part_dim; + + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + } + } } double bits = tr_tree_bits + coeff_bits; @@ -598,9 +616,27 @@ static double cu_rd_cost_tr_split_accurate( CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); } int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, tr_cu->tr_skip & 1); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + } + else { + int split_type = pred_cu->intra.isp_mode; + int part_dim = uvg_get_isp_split_dim(width, height, split_type); + int limit = split_type == ISP_MODE_HOR ? height : width; + int split_num = 0; + for (int part = 0; part < limit; part += part_dim) { + const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; + const int part_y = split_type == ISP_MODE_HOR ? y_px + part : y_px; + const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; + const int part_h = split_type == ISP_MODE_HOR ? 
height : part_dim; + + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + } + } } if(depth == 4 || tree_type == UVG_LUMA_T) { From 182d0f4e66c1e83f905fc0c006ac3be1ddd6b339 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 15:37:18 +0300 Subject: [PATCH 036/254] [isp] Remove old_scan tables and related asserts. Fix coefficient group indexing. --- src/encode_coding_tree.c | 8 +------- src/rdo.c | 10 ---------- src/search_intra.c | 5 ----- src/strategies/avx2/quant-avx2.c | 2 -- .../generic/encode_coding_tree-generic.c | 18 +++++++----------- src/strategies/generic/quant-generic.c | 2 -- 6 files changed, 8 insertions(+), 37 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index f917b31d..dcc0edeb 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -232,9 +232,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; // TODO: log2_cg_size is wrong if width != height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; - const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; - const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; - + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); @@ -251,10 +249,6 @@ void uvg_encode_ts_residual(encoder_state_t* const state, for (i = 0; i < width * height; i++) { if (coeff[scan[i]]) { - // ISP_DEBUG - assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); - assert(old_scan_cg[i >> log2_cg_size] == 
scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); - //scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } diff --git a/src/rdo.c b/src/rdo.c index 9f5abd21..4f5422bd 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1187,8 +1187,6 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; - const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); @@ -1216,9 +1214,6 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ for (uint32_t sbId = 0; sbId < cg_num; sbId++) { uint32_t cg_blkpos = scan_cg[sbId]; - // ISP_DEBUG - assert(old_scan[sbId] == scan[sbId] && "Old scan_cg differs from the new one."); - assert(old_scan_cg[sbId] == scan_cg[sbId] && "Old scan_cg differs from the new one."); int no_coeff_coded = 0; base_cost = 0.0; @@ -1435,9 +1430,6 @@ void uvg_rdoq( const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); - const uint32_t *old_scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_width - 1 ]; - const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; - const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); @@ -1487,7 +1479,6 @@ void uvg_rdoq( for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = 
cg_scanpos*cg_size + scanpos_in_cg; - assert(old_scan[scanpos] == scan[scanpos] && "Scan index differs from old system."); if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; uint32_t blkpos = scan[scanpos]; @@ -1523,7 +1514,6 @@ void uvg_rdoq( int32_t last_x_bits[32], last_y_bits[32]; for (int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { - assert(old_scan_cg[cg_scanpos] == scan_cg[cg_scanpos] && "Scan cg index differs from old system."); uint32_t cg_blkpos = scan_cg[cg_scanpos]; uint32_t cg_pos_y = cg_blkpos / num_blk_side; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); diff --git a/src/search_intra.c b/src/search_intra.c index 900f9113..ee06077b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -204,8 +204,6 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; // ISP_TODO: height - const uint32_t *old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_width - 1]; - const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_idx]; const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); @@ -217,9 +215,6 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, // ISP_TODO: height for (int i = 0; i < width * width; i++) { if (coeff[scan[i]]) { - // ISP_DEBUG - assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); - assert(old_scan_cg[i >> log2_cg_size] == scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 
962a671a..75a3ff8f 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -380,8 +380,6 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 21785501..145bafc4 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -78,9 +78,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; - const uint32_t *old_scan = uvg_g_sig_last_scan[scan_mode][log2_block_width - 1]; - const uint32_t *old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode]; + + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); @@ -92,11 +91,8 @@ void 
uvg_encode_coeff_nxn_generic(encoder_state_t * const state, unsigned scan_cg_last = (unsigned)-1; unsigned scan_pos_last = (unsigned)-1; - for (int i = 0; i < width * width; i++) { + for (int i = 0; i < width * height; i++) { if (coeff[scan[i]]) { - // ISP_DEBUG - assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one."); - assert(old_scan_cg[i >> log2_cg_size] == scan_cg[i >> log2_cg_size] && "Old scan_cg differs from the new one."); scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } @@ -144,14 +140,14 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, for (i = scan_cg_last; i >= 0; i--) { //int32_t abs_coeff[64*64]; - int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); - const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> log2_cg_width); + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> log2_cg_width)); + // !!! residual_coding_subblock() !!! 
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 16fbce38..e0c2744c 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -64,8 +64,6 @@ void uvg_quant_generic( const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - //const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - //const uint32_t * const old_scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); From 93317cafa42be246a043ebc89763a285507d30aa Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 18 Aug 2022 15:55:23 +0300 Subject: [PATCH 037/254] [isp] Write isp config bit to sps. --- src/encoder_state-bitstream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 832969fc..ba0d32f6 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -665,7 +665,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_UE(stream, encoder->cfg.log2_parallel_merge_level-2, "log2_parallel_merge_level_minus2"); - WRITE_U(stream, 0, 1, "sps_isp_enabled_flag"); + WRITE_U(stream, encoder->cfg.isp, 1, "sps_isp_enabled_flag"); if (state->encoder_control->cfg.mrl) { WRITE_U(stream, 1, 1, "sps_mrl_enabled_flag"); From 7062697beb366249e79a0ce20207a39e0161ad70 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 19 Aug 2022 15:05:22 +0300 Subject: [PATCH 038/254] [isp] Resolve TODOs. Make scan order tables const. 
--- src/encode_coding_tree.c | 6 +-- src/intra.c | 2 - src/rdo.c | 24 +++++----- src/search_intra.c | 19 ++++---- src/strategies/avx2/dct-avx2.c | 1 - src/strategies/avx2/quant-avx2.c | 1 - src/strategies/generic/dct-generic.c | 6 --- .../generic/encode_coding_tree-generic.c | 4 +- src/strategies/generic/intra-generic.c | 45 +------------------ src/strategies/generic/quant-generic.c | 4 +- src/tables.c | 7 ++- src/tables.h | 2 +- src/transform.c | 1 - 13 files changed, 32 insertions(+), 90 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index dcc0edeb..1b360926 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -58,7 +58,7 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr uint8_t mts_type = state->encoder_control->cfg.mts; bool mts_allowed = mts_type == UVG_MTS_BOTH || (pred_cu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : pred_cu->type == CU_INTER && mts_type == UVG_MTS_INTER); mts_allowed &= cu_width <= max_size && cu_height <= max_size; - //mts_allowed &= !cu.ispMode; // ISP_TODO: Uncomment this when ISP is implemented. + mts_allowed &= pred_cu->type == CU_INTRA ? 
!pred_cu->intra.isp_mode : true; //mts_allowed &= !cu.sbtInfo; mts_allowed &= !(pred_cu->bdpcmMode && cu_width <= ts_max_size && cu_height <= ts_max_size); mts_allowed &= pred_cu->tr_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ; @@ -233,8 +233,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // TODO: log2_cg_size is wrong if width != height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); double bits = 0; diff --git a/src/intra.c b/src/intra.c index a638661c..f0b79d2e 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1497,7 +1497,6 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ non_split_dim_size = height; } - // ISP_TODO: make a define for this. Depends on minimum transform block log2 side length const int min_num_samples = 16; // Minimum allowed number of samples for split block const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); @@ -1654,7 +1653,6 @@ void uvg_intra_recon_cu( LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, }; - // ISP_TODO: does not work with ISP yet, ask Joose when this is relevant. 
if (recon_luma && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); } diff --git a/src/rdo.c b/src/rdo.c index 4f5422bd..160cc0bc 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1187,8 +1187,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1391,14 +1391,13 @@ void uvg_rdoq( { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; - // ISP_TODO: these dimensions can be removed, they are same as log2_block_dimensions - uint32_t log2_tr_width = uvg_math_floor_log2(width); - uint32_t log2_tr_height = uvg_math_floor_log2(height); - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; - const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; - const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + int32_t scalinglist_type= (block_type == 
CU_INTRA ? 0 : 3) + color; int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); @@ -1407,8 +1406,8 @@ void uvg_rdoq( const double lambda = color ? state->c_lambda : state->lambda; - const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; - const double *err_scale = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; + const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; + const double *err_scale = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; double block_uncoded_cost = 0; @@ -1422,7 +1421,6 @@ void uvg_rdoq( memset(dest_coeff, 0, sizeof(coeff_t) * width * height); - // ISP_TODO: height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; @@ -1430,8 +1428,8 @@ void uvg_rdoq( const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); - const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; diff --git 
a/src/search_intra.c b/src/search_intra.c index ee06077b..cf25936d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -190,30 +190,27 @@ static void get_cost_dual(encoder_state_t * const state, * \param lcu_px Position of the top left pixel of current CU within current LCU. */ static void derive_mts_constraints(cu_info_t *const pred_cu, - lcu_t *const lcu, const int depth, + lcu_t *const lcu, const int width, const int height, const vector2d_t lcu_px) { - const int width = LCU_WIDTH >> depth; - const int height = width; // ISP_TODO: height - int8_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; int32_t i; // ToDo: large block support in VVC? uint32_t sig_coeffgroup_flag[32 * 32] = { 0 }; const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] - + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; // ISP_TODO: height - const uint32_t *scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); - const uint32_t *scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; signed scan_cg_last = -1; signed scan_pos_last = -1; - // ISP_TODO: height - for (int i = 0; i < width * width; i++) { + for (int i = 0; i < width * height; i++) { if (coeff[scan[i]]) { scan_pos_last = 
i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; @@ -405,7 +402,7 @@ static double search_intra_trdepth( if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; - derive_mts_constraints(pred_cu, lcu, depth, lcu_px); + derive_mts_constraints(pred_cu, lcu, width, height, lcu_px); if (pred_cu->tr_idx > 1) { if (pred_cu->violates_mts_coeff_constraint || !pred_cu-> mts_last_scan_pos) { diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 35890e91..4197f17a 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1598,7 +1598,6 @@ static void mts_dct_avx2( { tr_type_t type_hor; tr_type_t type_ver; - // ISP_TODO: height passed but not used uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 75a3ff8f..b6d062b0 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -501,7 +501,6 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr __m256i v_coef, q_coefs; __m256i v_quant_coeff_lo, v_quant_coeff_hi; - // ISP_TODO: do these avx common functions need height? 
scanord_read_vector(coeffs, scan, scan_idx, subpos, width, result_coeffs, 2); v_coef = result_coeffs[0]; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index c790034f..8798da11 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -739,12 +739,6 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \ } - -//static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output) -//{ -// // ISP_TODO: non-square transform here -//} - DCT_NXN_GENERIC(4); DCT_NXN_GENERIC(8); DCT_NXN_GENERIC(16); diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 145bafc4..242e86bc 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -80,8 +80,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); // Init base contexts according to block type diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 14418f35..faf476e1 100644 --- a/src/strategies/generic/intra-generic.c +++ 
b/src/strategies/generic/intra-generic.c @@ -124,7 +124,6 @@ static void uvg_angular_pred_generic( const bool vertical_mode = intra_mode >= 34; // Modes distance to horizontal or vertical mode. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18); - //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; @@ -140,23 +139,6 @@ static void uvg_angular_pred_generic( // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { - - // ISP_TODO: might be able to use memcpy instead of loops here, should be a bit faster. - /*if (vertical_mode) { - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_main[width + i] = in_ref_above[i]; - } - for (int j = 0; j <= height + 1 + multi_ref_index; j++) { - temp_side[height + j] = in_ref_left[j]; - } - } else { - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_side[width + i] = in_ref_above[i]; - } - for (int j = 0; j <= height + 1 + multi_ref_index; j++) { - temp_main[height + j] = in_ref_left[j]; - } - }*/ memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); @@ -259,10 +241,6 @@ static void uvg_angular_pred_generic( int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { - // ISP_TODO: these are introduced in the beginning of this function or am I missing something? 
- static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; if ((abs(sample_disp) & 0x1F) != 0) { use_cubic = false; @@ -361,8 +339,8 @@ static void uvg_angular_pred_generic( // Mode is horizontal or vertical, just copy the pixels. // Do not apply PDPC if multi ref line index is other than 0 - // ISP_TODO: do not do PDPC if block is in BDPCM mode - bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0); + // TODO: do not do PDPC if block is in BDPCM mode + bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { int scale = (log2_width + log2_height - 2) >> 2; @@ -381,25 +359,6 @@ static void uvg_angular_pred_generic( memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); } } - // ISP_TODO: there is no reason to run these loops AND then check if PDPC is applied. Do the check first and then run either the normal or PDPC loops - - //for (int_fast32_t y = 0; y < height; ++y) { - // for (int_fast32_t x = 0; x < width; ++x) { - // dst[y * width + x] = ref_main[x + 1]; - // } - // // Do not apply PDPC if multi ref line index is other than 0 - // // ISP_TODO: do not do PDPC if block is in BDPCM mode - // if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - // int scale = (log2_width + log2_height - 2) >> 2; - // const uvg_pixel top_left = ref_main[0]; - // const uvg_pixel left = ref_side[1 + y]; - // for (int i = 0; i < MIN(3 << scale, width); i++) { // ISP_TODO: is one loop enough for PDPC? 
- // const int wL = 32 >> (2 * i >> scale); - // const uvg_pixel val = dst[y * width + i]; - // dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); - // } - // } - //} } // Flip the block if this is was a horizontal mode. diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index e0c2744c..3de27958 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -255,7 +255,7 @@ int uvg_quant_cbcr_residual_generic( ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - // ISP_TODO: this function is not fully converted to handle non-square blocks + // TODO: this function is not fully converted to handle non-square blocks { int y, x; for (y = 0; y < height; ++y) { @@ -494,7 +494,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Quantize coeffs. (coeff -> coeff_out) if (state->encoder_control->cfg.rdoq_enable && - (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) // ISP_TODO: width check here might not be necessary, therefore also height check unnecessary. Investigate. + (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 
1 : 0); diff --git a/src/tables.c b/src/tables.c index 0d51f2f4..dec8b467 100644 --- a/src/tables.c +++ b/src/tables.c @@ -2573,7 +2573,7 @@ static const uint32_t const g_scan_order_buffer[32258] = { // Get scan order table based on scan group type (grouped or ungrouped) // and log2 block width and height index -static const uint32_t* g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = +static const uint32_t* const g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = { { { g_scan_order_buffer + 0, g_scan_order_buffer + 1, g_scan_order_buffer + 3, g_scan_order_buffer + 7, g_scan_order_buffer + 15, g_scan_order_buffer + 31, g_scan_order_buffer + 63, }, @@ -2606,16 +2606,15 @@ static const uint32_t* g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_I * * \return Returns pointer to scan order table based on given dimensions. */ -uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) { - // ISP_TODO: horizontal and vertical scan types + // TODO: horizontal and vertical scan types assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); if (scan_group == SCAN_GROUP_4X4) { return g_scan_order[scan_group][log2_w][log2_h]; } else { - // ISP_TODO: returning coef group type does not work yet. 
It will break for non-square blocks return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; } } diff --git a/src/tables.h b/src/tables.h index 0d52ea87..44621251 100644 --- a/src/tables.h +++ b/src/tables.h @@ -143,6 +143,6 @@ extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; #define SCAN_GROUP_UNGROUPED 0 #define SCAN_GROUP_4X4 1 -uint32_t* uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); #endif //TABLES_H_ diff --git a/src/transform.c b/src/transform.c index 4738f942..ffe3c05b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1194,7 +1194,6 @@ static void quantize_tr_residual( } if (cfg->lossless) { - // ISP_TODO: is there any sensible case where in and out strides would be different? has_coeffs = bypass_transquant(tr_width, tr_height, lcu_width, // in stride From bbb8faea986898e68b752f9803ed764558b1ccf9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Sun, 21 Aug 2022 12:46:07 +0300 Subject: [PATCH 039/254] [isp] Modify encode transform coeff func to handle non-square blocks, use cu_loc_t where possible. Fix mistake in mts idct generic. 
--- src/encode_coding_tree.c | 59 ++++++++++++++++++---------- src/global.h | 4 +- src/intra.c | 12 +++--- src/search.c | 8 ++-- src/strategies/generic/dct-generic.c | 2 +- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 1b360926..33c51c30 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -516,8 +516,7 @@ static void encode_chroma_tu( static void encode_transform_unit( encoder_state_t * const state, - int x, - int y, + const cu_loc_t *cu_loc, int depth, bool only_chroma, lcu_coeff_t* coeff, @@ -528,11 +527,12 @@ static void encode_transform_unit( const videoframe_t * const frame = state->tile->frame; cabac_data_t* const cabac = &state->cabac; - const uint8_t width = LCU_WIDTH >> depth; - const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); - // TODO: height for non-square blocks - const uint8_t height = width; - const uint8_t height_c = width_c; + const int x = cu_loc->x; + const int y = cu_loc->y; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? 
frame->cu_array : frame->chroma_cu_array; const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y); @@ -604,8 +604,7 @@ static void encode_transform_unit( */ static void encode_transform_coeff( encoder_state_t * const state, - int32_t x, - int32_t y, + const cu_loc_t * cu_loc, int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, @@ -616,6 +615,11 @@ static void encode_transform_coeff( bool last_split) // Always true except when writing sub partition coeffs (ISP) { cabac_data_t * const cabac = &state->cabac; + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; + //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; @@ -687,13 +691,17 @@ static void encode_transform_coeff( } if (split) { - uint8_t offset = LCU_WIDTH >> (depth + 1); - int x2 = x + offset; - int y2 = y + offset; - encode_transform_coeff(state, x, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); - encode_transform_coeff(state, x2, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); - encode_transform_coeff(state, x, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); - encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + int split_width = width >> 1; + int split_height = height >> 1; + + for (int j = 0; j < 2; j++) { + for (int i = 0; i < 2; i++) { + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); + + encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + } + } return; } @@ -743,7 +751,8 @@ static void encode_transform_coeff( cabac->cur_ctx = 
&cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, x, y, depth, only_chroma, coeff, tree_type, last_split); + + encode_transform_unit(state, cu_loc, depth, only_chroma, coeff, tree_type, last_split); } } @@ -1372,6 +1381,9 @@ void uvg_encode_coding_tree( const int cu_height = cu_width; // TODO: height for non-square blocks const int half_cu = cu_width >> 1; + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, x, y, cu_width, cu_height); + const cu_info_t *left_cu = NULL; if (x > 0) { left_cu = uvg_cu_array_at_const(used_array, x - 1, y); @@ -1575,6 +1587,7 @@ void uvg_encode_coding_tree( bool non_zero_mvd = false; for (int i = 0; i < num_pu; ++i) { + // TODO: height for non-square blocks const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); @@ -1610,9 +1623,8 @@ void uvg_encode_coding_tree( CABAC_BIN(cabac, cbf, "rqt_root_cbf"); } // Code (possible) coeffs to bitstream - if (cbf) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type, true); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true); } encode_mts_idx(state, cabac, cur_cu); @@ -1643,10 +1655,15 @@ void uvg_encode_coding_tree( for (int part = 0; part < limit; part += part_dim) { const int part_x = split_type == ISP_MODE_HOR ? x : x + part; const int part_y = split_type == ISP_MODE_HOR ? y + part : y; + const int part_w = split_type == ISP_MODE_HOR ? cu_width : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? 
part_dim : cu_height; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, part_x, part_y, part_w, part_h); + // Check if last split to write chroma bool last_split = (part + part_dim) == limit; - encode_transform_coeff(state, part_x, part_y, depth, 0, 0, 0, 0, coeff, tree_type, last_split); + encode_transform_coeff(state, &loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split); } } @@ -1665,7 +1682,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff, tree_type, true); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } diff --git a/src/global.h b/src/global.h index 65ca2fa9..907da8b7 100644 --- a/src/global.h +++ b/src/global.h @@ -129,8 +129,8 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -//#define UVG_DEBUG_PRINT_CABAC 1 -//#define UVG_DEBUG 1 +#define UVG_DEBUG_PRINT_CABAC 1 +#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/intra.c b/src/intra.c index f0b79d2e..c664372e 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1062,7 +1062,7 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. // TODO: height for non-square blocks - px_available_left = MIN(px_available_left, width * 2 + multi_ref_index); + px_available_left = MIN(px_available_left, height * 2 + multi_ref_index); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1072,13 +1072,13 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. 
uvg_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride]; - for (int i = px_available_left; i < width * 2 + multi_ref_index * 2; ++i) { + for (int i = px_available_left; i < height * 2 + multi_ref_index * 2; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // If we are on the left edge, extend the first pixel of the top row. uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + for (int i = 0; i < height * 2 + multi_ref_index; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -1672,10 +1672,10 @@ void uvg_intra_recon_cu( int split_num = 0; for (int part = 0; part < limit; part += part_dim) { cbf_clear(&cur_cu->cbf, depth, COLOR_Y); - const int part_x = split_type == ISP_MODE_HOR ? x : x + part; + const int part_x = split_type == ISP_MODE_HOR ? x: x + part; const int part_y = split_type == ISP_MODE_HOR ? y + part: y; - const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; - const int part_h = split_type == ISP_MODE_HOR ? height : part_dim; + const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? part_dim : height; cu_loc_t loc; uvg_cu_loc_ctor(&loc, part_x, part_y, part_w, part_h); diff --git a/src/search.c b/src/search.c index f9b58439..4e1bcaa7 100644 --- a/src/search.c +++ b/src/search.c @@ -392,8 +392,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, for (int part = 0; part < limit; part += part_dim) { const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; const int part_y = split_type == ISP_MODE_HOR ? y_px + part : y_px; - const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; - const int part_h = split_type == ISP_MODE_HOR ? height : part_dim; + const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? 
part_dim : height; const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; @@ -629,8 +629,8 @@ static double cu_rd_cost_tr_split_accurate( for (int part = 0; part < limit; part += part_dim) { const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; const int part_y = split_type == ISP_MODE_HOR ? y_px + part : y_px; - const int part_w = split_type == ISP_MODE_HOR ? part_dim : width; - const int part_h = split_type == ISP_MODE_HOR ? height : part_dim; + const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? part_dim : height; const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 8798da11..03453b90 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2574,7 +2574,7 @@ static void mts_idct_generic( } partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2]; - partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2]; + partial_tr_func* idct_ver = idct_table[type_ver][log2_height_minus2]; int16_t tmp[32 * 32]; const int max_log2_tr_dynamic_range = 15; From f86dc29ce7d51ca9dd0c7f375f5f040490c190d0 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 22 Aug 2022 14:01:26 +0300 Subject: [PATCH 040/254] [isp] Fix mistake in cost calculation. Remove some commented out code blocks. 
--- src/encode_coding_tree.c | 24 +----------------------- src/search.c | 2 +- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 33c51c30..7e8d76e0 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1014,9 +1014,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } */ - // Intra Subpartition mode uint32_t width = (LCU_WIDTH >> depth); - uint32_t height = (LCU_WIDTH >> depth); + uint32_t height = (LCU_WIDTH >> depth); // TODO: height for non-square blocks // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; @@ -1079,10 +1078,6 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, bool allow_isp = enable_isp ? uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) : false; uint8_t isp_mode = allow_isp ? cur_cu->intra.isp_mode : 0; - // ToDo: add height comparison - //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 1 : 0; - //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 
2 : 0; - if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { if (isp_mode == ISP_MODE_NO_ISP) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subpartitions_mode"); @@ -1093,23 +1088,6 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } } - - //if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { - // if (isp_mode == ISP_MODE_NO_ISP) { - // if (isp_mode) { - // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); - // } - // else { - // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); - // // ToDo: complete this if-clause - // if (isp_mode == 3) { - // CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); - // } - // } - // } - //} - - const int cu_width = LCU_WIDTH >> depth; // PREDINFO CODING // If intra prediction mode is found from the predictors, diff --git a/src/search.c b/src/search.c index 4e1bcaa7..215c329b 100644 --- a/src/search.c +++ b/src/search.c @@ -567,7 +567,7 @@ static double cu_rd_cost_tr_split_accurate( const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. - if (is_isp) { + if (!is_isp) { const int is_tr_split = depth - tr_cu->depth; if ((pred_cu->type == CU_INTRA || is_tr_split || From 510798cb3dfc11e16c1b8d1e17c770888e948eb8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 23 Aug 2022 13:20:57 +0300 Subject: [PATCH 041/254] [isp] Fix mistake in isp cabac write. Intra luma mpm flag bit was checking isp when it did not need to. 
--- src/encode_coding_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 7e8d76e0..83ee8585 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1143,8 +1143,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } // Is the mode in the MPM array or not flag = (mpm_preds == -1) ? 0 : 1; - if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag"); + if (cur_pu->intra.multi_ref_idx == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "intra_luma_mpm_flag"); } // Signal index of the prediction mode in the prediction list, if it is there From 56ebea735847b4145fd096722b15bd1c0630d64d Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 23 Aug 2022 17:27:57 +0300 Subject: [PATCH 042/254] [isp] Set cbfs for isp splits after search. Add helper function for isp split number. --- src/encode_coding_tree.c | 18 +++----------- src/global.h | 4 +-- src/intra.c | 48 +++++++++++++++++++++++++----------- src/intra.h | 2 ++ src/search.c | 53 ++++++++++++++++++++++++++++------------ 5 files changed, 79 insertions(+), 46 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 83ee8585..175aeb4d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1623,24 +1623,14 @@ void uvg_encode_coding_tree( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. int split_type = cur_cu->intra.isp_mode; + int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); - int part_dim = cu_width; - if (split_type != ISP_MODE_NO_ISP) { - part_dim = uvg_get_isp_split_dim(cu_width, cu_height, split_type); - } - int limit = split_type == ISP_MODE_HOR ? 
cu_height : cu_width; - - for (int part = 0; part < limit; part += part_dim) { - const int part_x = split_type == ISP_MODE_HOR ? x : x + part; - const int part_y = split_type == ISP_MODE_HOR ? y + part : y; - const int part_w = split_type == ISP_MODE_HOR ? cu_width : part_dim; - const int part_h = split_type == ISP_MODE_HOR ? part_dim : cu_height; - + for (int i = 0; i < split_limit; ++i) { cu_loc_t loc; - uvg_cu_loc_ctor(&loc, part_x, part_y, part_w, part_h); + uvg_get_isp_split_loc(&loc, x, y, cu_width, cu_height, i, split_type); // Check if last split to write chroma - bool last_split = (part + part_dim) == limit; + bool last_split = (i + 1) == split_limit; encode_transform_coeff(state, &loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split); } } diff --git a/src/global.h b/src/global.h index 907da8b7..65ca2fa9 100644 --- a/src/global.h +++ b/src/global.h @@ -129,8 +129,8 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -#define UVG_DEBUG_PRINT_CABAC 1 -#define UVG_DEBUG 1 +//#define UVG_DEBUG_PRINT_CABAC 1 +//#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/intra.c b/src/intra.c index c664372e..251ab4d2 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1507,6 +1507,34 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ } +int uvg_get_isp_split_num(const int width, const int height, const int split_type) +{ + assert((split_type != ISP_MODE_NO_ISP) && "This function cannot be called if ISP mode is 0."); + int split_dim = uvg_get_isp_split_dim(width, height, split_type); + int num = split_type == ISP_MODE_HOR ? 
height / split_dim : width / split_dim; + + return num; +} + + +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type) +{ + assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); + int part_dim = block_w; + if (split_type != ISP_MODE_NO_ISP) { + part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type); + } + const int offset = part_dim * split_idx; + + const int part_x = split_type == ISP_MODE_HOR ? x : x + offset; + const int part_y = split_type == ISP_MODE_HOR ? y + offset : y; + const int part_w = split_type == ISP_MODE_HOR ? block_w : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? part_dim : block_h; + + uvg_cu_loc_ctor(loc, part_x, part_y, part_w, part_h); +} + + static void intra_recon_tb_leaf( encoder_state_t* const state, const cu_loc_t* cu_loc, @@ -1667,25 +1695,17 @@ void uvg_intra_recon_cu( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. int split_type = search_data->pred_cu.intra.isp_mode; - int part_dim = uvg_get_isp_split_dim(width, height, split_type); - int limit = split_type == ISP_MODE_HOR ? height : width; - int split_num = 0; - for (int part = 0; part < limit; part += part_dim) { - cbf_clear(&cur_cu->cbf, depth, COLOR_Y); - const int part_x = split_type == ISP_MODE_HOR ? x: x + part; - const int part_y = split_type == ISP_MODE_HOR ? y + part: y; - const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; - const int part_h = split_type == ISP_MODE_HOR ? part_dim : height; + int split_limit = split_type == ISP_MODE_NO_ISP ? 
1 : uvg_get_isp_split_num(width, height, split_type); + for (int i = 0; i < split_limit; ++i) { cu_loc_t loc; - uvg_cu_loc_ctor(&loc, part_x, part_y, part_w, part_h); + uvg_get_isp_split_loc(&loc, x, y, width, height, i, split_type); intra_recon_tb_leaf(state, &loc, lcu, COLOR_Y, search_data, tree_type); uvg_quantize_lcu_residual(state, true, false, false, - &loc, depth, cur_cu, lcu, - false, tree_type); - search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (split_num++); - + &loc, depth, cur_cu, lcu, + false, tree_type); + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; diff --git a/src/intra.h b/src/intra.h index 6fee8f1f..51ed41c9 100644 --- a/src/intra.h +++ b/src/intra.h @@ -170,5 +170,7 @@ int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* l #define SPLIT_TYPE_VER 2 int uvg_get_isp_split_dim(const int width, const int height, const int split_type); +int uvg_get_isp_split_num(const int width, const int height, const int split_type); +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type); bool uvg_can_use_isp(const int width, const int height, const int max_tr_size); bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/search.c b/src/search.c index 215c329b..ee0fc211 100644 --- a/src/search.c +++ b/src/search.c @@ -36,6 +36,7 @@ #include #include "cabac.h" +#include "cu.h" #include "encoder.h" #include "encode_coding_tree.h" #include "imagelist.h" @@ -386,15 +387,17 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } else { int split_type = pred_cu->intra.isp_mode; - int part_dim = uvg_get_isp_split_dim(width, height, split_type); - int limit = split_type == ISP_MODE_HOR ? 
height : width; - int split_num = 0; - for (int part = 0; part < limit; part += part_dim) { - const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; - const int part_y = split_type == ISP_MODE_HOR ? y_px + part : y_px; - const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; - const int part_h = split_type == ISP_MODE_HOR ? part_dim : height; + int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(width, height, split_type); + for (int i = 0; i < split_limit; ++i) { + cu_loc_t loc; + uvg_get_isp_split_loc(&loc, x_px, y_px, width, height, i, split_type); + const int part_x = loc.x; + const int part_y = loc.y; + const int part_w = loc.width; + const int part_h = loc.height; + + // TODO: maybe just pass the cu_loc_t to these functions const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); @@ -623,15 +626,17 @@ static double cu_rd_cost_tr_split_accurate( } else { int split_type = pred_cu->intra.isp_mode; - int part_dim = uvg_get_isp_split_dim(width, height, split_type); - int limit = split_type == ISP_MODE_HOR ? height : width; - int split_num = 0; - for (int part = 0; part < limit; part += part_dim) { - const int part_x = split_type == ISP_MODE_HOR ? x_px : x_px + part; - const int part_y = split_type == ISP_MODE_HOR ? y_px + part : y_px; - const int part_w = split_type == ISP_MODE_HOR ? width : part_dim; - const int part_h = split_type == ISP_MODE_HOR ? part_dim : height; + int split_limit = split_type == ISP_MODE_NO_ISP ? 
1 : uvg_get_isp_split_num(width, height, split_type); + for (int i = 0; i < split_limit; ++i) { + cu_loc_t loc; + uvg_get_isp_split_loc(&loc, x_px, y_px, width, height, i, split_type); + const int part_x = loc.x; + const int part_y = loc.y; + const int part_w = loc.width; + const int part_h = loc.height; + + // TODO: maybe just pass the cu_loc_t to these functions const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); @@ -858,6 +863,7 @@ static double search_cu( const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const int cu_width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; + const int cu_height = cu_width; // TODO: height const int luma_width = LCU_WIDTH >> depth; assert(cu_width >= 4); double cost = MAX_DOUBLE; @@ -1122,6 +1128,21 @@ static double search_cu( depth, &intra_search, NULL, lcu, tree_type,recon_luma,recon_chroma); + // Set isp split cbfs here + const int split_type = intra_search.pred_cu.intra.isp_mode; + const int split_num = split_type == ISP_MODE_NO_ISP ? 
1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); + for (int i = 0; i < split_num; ++i) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); + //search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x, isp_loc.y); + bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; + cbf_clear(&split_cu->cbf, depth, COLOR_Y); + if (cur_cbf) { + cbf_set(&split_cu->cbf, depth, COLOR_Y); + } + } + if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, From b53308f258d62f42eaf469fec0c89143849e2908 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 15:11:01 +0300 Subject: [PATCH 043/254] [isp] Fix mistake in setting cbfs. Skip setting if ISP is not used. --- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index ee0fc211..d81ca2e2 100644 --- a/src/search.c +++ b/src/search.c @@ -1130,7 +1130,7 @@ static double search_cu( lcu, tree_type,recon_luma,recon_chroma); // Set isp split cbfs here const int split_type = intra_search.pred_cu.intra.isp_mode; - const int split_num = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); + const int split_num = split_type == ISP_MODE_NO_ISP ? 0 : uvg_get_isp_split_num(cu_width, cu_height, split_type); From 39f30563c54c5621ce77c8709fd5f48c0b3b1e86 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 15:40:14 +0300 Subject: [PATCH 044/254] [isp] Fix chroma width error when writing cu loc. Remove redundant ISP mode checks. 
--- src/encode_coding_tree.c | 12 ++++++++---- src/intra.c | 2 +- src/search.c | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 175aeb4d..238e6fae 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1355,12 +1355,16 @@ void uvg_encode_coding_tree( const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); - const int cu_width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks + const int width = LCU_WIDTH >> depth; + const int height = width; // TODO: height for non-square blocks + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, x, y, width, height); + + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc.width : cu_loc.chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc.height : cu_loc.chroma_height; const int half_cu = cu_width >> 1; - cu_loc_t cu_loc; - uvg_cu_loc_ctor(&cu_loc, x, y, cu_width, cu_height); + const cu_info_t *left_cu = NULL; if (x > 0) { diff --git a/src/intra.c b/src/intra.c index 251ab4d2..700657d9 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1695,7 +1695,7 @@ void uvg_intra_recon_cu( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. int split_type = search_data->pred_cu.intra.isp_mode; - int split_limit = split_type == ISP_MODE_NO_ISP ? 
1 : uvg_get_isp_split_num(width, height, split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { cu_loc_t loc; diff --git a/src/search.c b/src/search.c index d81ca2e2..0e2bf285 100644 --- a/src/search.c +++ b/src/search.c @@ -387,7 +387,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } else { int split_type = pred_cu->intra.isp_mode; - int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(width, height, split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { cu_loc_t loc; @@ -626,7 +626,7 @@ static double cu_rd_cost_tr_split_accurate( } else { int split_type = pred_cu->intra.isp_mode; - int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(width, height, split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { cu_loc_t loc; From 10f9b2be26c7042f2ea7eb0f4f32972c3f6d1bd4 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 16:00:53 +0300 Subject: [PATCH 045/254] [isp] Keep lfnst constraint up to date during search. 
--- src/search.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 0e2bf285..2824994d 100644 --- a/src/search.c +++ b/src/search.c @@ -622,7 +622,7 @@ static double cu_rd_cost_tr_split_accurate( if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } else { int split_type = pred_cu->intra.isp_mode; @@ -639,7 +639,7 @@ static double cu_rd_cost_tr_split_accurate( // TODO: maybe just pass the cu_loc_t to these functions const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } } } From 2e8f008de4562013d5de09ffda0a5b359ac09dda Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 16:15:03 +0300 Subject: [PATCH 046/254] [isp] Redo call hierarchy to include x, y coordinates. 
--- src/encode_coding_tree.c | 29 +++++------ src/intra.c | 9 ++-- src/rdo.c | 19 ++++--- src/rdo.h | 3 +- src/search.c | 51 ++++++++++--------- src/search_inter.c | 3 +- src/search_intra.c | 28 +++++----- src/strategies/avx2/encode_coding_tree-avx2.h | 4 +- .../generic/encode_coding_tree-generic.c | 21 +++++--- .../generic/encode_coding_tree-generic.h | 3 +- src/strategies/strategies-encode.h | 3 +- src/transform.c | 18 ++++--- src/transform.h | 3 +- 13 files changed, 104 insertions(+), 90 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 238e6fae..3c2d1947 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -465,10 +465,8 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, - int x, - int y, + cu_loc_t *cu_loc, int depth, - const uint8_t width_c, cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, @@ -476,9 +474,10 @@ static void encode_chroma_tu( enum uvg_tree_type tree_type) { - int height_c = width_c; // TODO: height for non-square blocks - int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; - int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; + int width_c = cu_loc->chroma_width; + //int height_c = cu_loc->chroma_height; + int x_local = ((cu_loc->x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; + int y_local = ((cu_loc->y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); if(!joint_chroma){ @@ -486,13 +485,14 @@ static void encode_chroma_tu( const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) { + // ISP_TODO: do these checks need height? 
if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; // HEVC only supports transform_skip for Luma // TODO: transform skip for chroma blocks CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, height_c, COLOR_U, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, cu_loc, COLOR_U, *scan_idx, cur_pu, NULL); } if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { @@ -500,7 +500,7 @@ static void encode_chroma_tu( cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } else { @@ -509,7 +509,7 @@ static void encode_chroma_tu( cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } @@ -561,8 +561,7 @@ static void encode_transform_unit( uvg_encode_coeff_nxn(state, cabac, coeff_y, - width, - height, + cu_loc, 0, scan_idx, (cu_info_t * )cur_pu, @@ -589,7 +588,7 @@ static void encode_transform_unit( cbf_is_set(cur_pu->cbf, depth, COLOR_V); if ((chroma_cbf_set || joint_chroma) && last_split) { //Need to drop const to get lfnst constraints - encode_chroma_tu(state, x, y, depth, width_c, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + encode_chroma_tu(state, cu_loc, depth, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); } } @@ -1630,12 +1629,12 @@ void uvg_encode_coding_tree( int 
split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); for (int i = 0; i < split_limit; ++i) { - cu_loc_t loc; - uvg_get_isp_split_loc(&loc, x, y, cu_width, cu_height, i, split_type); + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type); // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split); + encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split); } } diff --git a/src/intra.c b/src/intra.c index 700657d9..123b82ae 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1520,6 +1520,7 @@ int uvg_get_isp_split_num(const int width, const int height, const int split_typ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type) { assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); + assert((split_type == ISP_MODE_NO_ISP && split_idx > 0) && "Trying to ISP split when split type = NO_ISP."); int part_dim = block_w; if (split_type != ISP_MODE_NO_ISP) { part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type); @@ -1698,12 +1699,12 @@ void uvg_intra_recon_cu( int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { - cu_loc_t loc; - uvg_get_isp_split_loc(&loc, x, y, width, height, i, split_type); + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x, y, width, height, i, split_type); - intra_recon_tb_leaf(state, &loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, &split_loc, lcu, COLOR_Y, search_data, tree_type); uvg_quantize_lcu_residual(state, true, false, false, - &loc, depth, cur_cu, lcu, + &split_loc, depth, cur_cu, lcu, false, tree_type); search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); } diff --git 
a/src/rdo.c b/src/rdo.c index 160cc0bc..c467dc94 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -297,15 +297,17 @@ out: static INLINE double get_coeff_cabac_cost( const encoder_state_t * const state, const coeff_t *coeff, - int32_t width, - int32_t height, + cu_loc_t *cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, cu_info_t* cur_tu) { + const int width = cu_loc->width; + const int height = cu_loc->height; // Make sure there are coeffs present bool found = false; + // ISP_TODO: this needs to be two separate x, y loops? for (int i = 0; i < width * height; i++) { if (coeff[i] != 0) { found = 1; @@ -331,8 +333,7 @@ static INLINE double get_coeff_cabac_cost( uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, coeff, - width, - height, + cu_loc, color, scan_mode, cur_tu, @@ -394,8 +395,7 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, - int32_t height, + cu_loc_t *cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip) @@ -403,6 +403,9 @@ double uvg_get_coeff_cost( uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on; uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit && state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) { // TODO: do we need to assert(0) out of the fast-estimation branch if we @@ -415,13 +418,13 @@ double uvg_get_coeff_cost( uint64_t weights = uvg_fast_coeff_get_weights(state); uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, height, weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { save_ccc(state->qp, coeff, width * width, ccc); } diff --git a/src/rdo.h b/src/rdo.h index 88a6548b..c9b88df3 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -73,8 +73,7 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, - int32_t height, + cu_loc_t *cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip); diff --git a/src/search.c b/src/search.c index 2824994d..293a807f 100644 --- a/src/search.c +++ b/src/search.c @@ -316,6 +316,9 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); + // cur_cu is used for TU parameters. 
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -383,24 +386,22 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } else { int split_type = pred_cu->intra.isp_mode; int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { - cu_loc_t loc; - uvg_get_isp_split_loc(&loc, x_px, y_px, width, height, i, split_type); - const int part_x = loc.x; - const int part_y = loc.y; - const int part_w = loc.width; - const int part_h = loc.height; + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type); + const int part_x = split_loc.x; + const int part_y = split_loc.y; // TODO: maybe just pass the cu_loc_t to these functions const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } } } @@ -421,6 +422,9 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); + double tr_tree_bits = 0; double coeff_bits = 0; @@ -491,11 +495,11 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); if((pred_cu->joint_cb_cr & 3) 
== 0){ - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, height, 2, scan_order, 0); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, height, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, &loc, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, &loc, 2, scan_order, 0); } else { - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, height, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, &loc, 2, scan_order, 0); } } @@ -518,6 +522,9 @@ static double cu_rd_cost_tr_split_accurate( const int width = LCU_WIDTH >> depth; const int height = width; // TODO: height for non-square blocks + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -622,24 +629,22 @@ static double cu_rd_cost_tr_split_accurate( if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } else { int split_type = pred_cu->intra.isp_mode; int split_limit = uvg_get_isp_split_num(width, height, split_type); for (int i = 0; i < split_limit; ++i) { - cu_loc_t loc; - uvg_get_isp_split_loc(&loc, x_px, y_px, width, height, i, split_type); - const int part_x = loc.x; - const int part_y = loc.y; - const int part_w = loc.width; - const int part_h = loc.height; + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type); + const int part_x = split_loc.x; + const int part_y = split_loc.y; // TODO: maybe just pass the cu_loc_t to these functions const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, part_w, part_h, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); } } } @@ -691,8 +696,8 @@ static double cu_rd_cost_tr_split_accurate( if(chroma_can_use_tr_skip && cb_flag_v) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, tr_cu->tr_skip & 2); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, chroma_height, COLOR_V, scan_order, tr_cu->tr_skip & 4); + coeff_bits += uvg_get_coeff_cost(state, 
&lcu->coeff.u[index], tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, &loc, COLOR_V, scan_order, tr_cu->tr_skip & 4); } else { @@ -709,11 +714,11 @@ static double cu_rd_cost_tr_split_accurate( if (chroma_can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, &loc, COLOR_U, scan_order, 0); } } - if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { + if (uvg_is_lfnst_allowed(state, tr_cu, width, height, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? 
tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( cabac, diff --git a/src/search_inter.c b/src/search_inter.c index e2013c6a..93598ff2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2243,8 +2243,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, depth, lcu, &cabac_copy, - width, - width, + &loc, index, 0, cur_cu, diff --git a/src/search_intra.c b/src/search_intra.c index cf25936d..5f0b3669 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1495,11 +1495,16 @@ int8_t uvg_search_intra_chroma_rdo( { const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); - int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - int8_t width = 1 << log2_width; - int8_t height = 1 << log2_width; - const cu_loc_t loc = { x_px & ~7, y_px & ~7, width, height, width, height }; + const int luma_width = LCU_WIDTH >> depth; + const int luma_height = LCU_WIDTH >> depth; // TODO: height + int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); + + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_px & ~7, y_px & ~7, luma_width, luma_height); + + const int chroma_width = loc.chroma_width; + const int chroma_height = loc.chroma_height; uvg_intra_references refs[2]; const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; const vector2d_t pic_px = { @@ -1576,26 +1581,25 @@ int8_t uvg_search_intra_chroma_rdo( &lcu->ref.u[offset], u_pred, u_resi, - width, - height, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_generate_residual( &lcu->ref.v[offset], v_pred, v_resi, - width, - height, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, depth, lcu, &temp_cabac, - width, - height, + &loc, offset, mode, pred_cu, diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h index 9fc75c8a..fa4ec8d5 100644 --- a/src/strategies/avx2/encode_coding_tree-avx2.h +++ 
b/src/strategies/avx2/encode_coding_tree-avx2.h @@ -38,14 +38,14 @@ * Functions for writing the coding quadtree and related syntax. */ +#include "cu.h" #include "encoderstate.h" #include "global.h" void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, - uint8_t height, + cu_loc_t *loc, uint8_t type, int8_t scan_mode, int8_t tr_skip, diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 242e86bc..756cd6d6 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -54,13 +54,16 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, - uint8_t height, + cu_loc_t *cu_loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, - double* bits_out) { - + double* bits_out) +{ + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; uint8_t last_coeff_x = 0; @@ -91,10 +94,12 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, unsigned scan_cg_last = (unsigned)-1; unsigned scan_pos_last = (unsigned)-1; - for (int i = 0; i < width * height; i++) { - if (coeff[scan[i]]) { - scan_pos_last = i; - sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + if (coeff[scan[i + j * width]]) { + scan_pos_last = i + j * width; + sig_coeffgroup_flag[scan_cg[(i + j * width) >> log2_cg_size]] = 1; + } } } scan_cg_last = scan_pos_last >> log2_cg_size; diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index bcf51f15..09255deb 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -44,8 +44,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, - uint8_t height, + cu_loc_t *loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index f503eb73..2bffacca 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -49,8 +49,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, - uint8_t heigth, + cu_loc_t *loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/transform.c b/src/transform.c index ffe3c05b..b9fabd65 100644 --- a/src/transform.c +++ b/src/transform.c @@ -483,8 +483,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + cu_loc_t *cu_loc, const int offset, const uint8_t mode, 
cu_info_t* pred_cu, @@ -499,6 +498,9 @@ void uvg_chroma_transform_search( ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; + const int width = cu_loc->chroma_width; + const int height = cu_loc->chroma_height; + uvg_transform2d( state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); @@ -689,8 +691,7 @@ void uvg_chroma_transform_search( state, u_quant_coeff, pred_cu, - width, - height, + cu_loc, COLOR_U, scan_order, transforms[i] == CHROMA_TS); @@ -706,8 +707,7 @@ void uvg_chroma_transform_search( state, v_quant_coeff, pred_cu, - width, - height, + cu_loc, COLOR_V, scan_order, transforms[i] == CHROMA_TS); @@ -1161,6 +1161,8 @@ static void quantize_tr_residual( // Pointers to current location in arrays with quantized coefficients. coeff_t *coeff = NULL; + // ISP_TODO: use temp coeff array size MAX_TR_WIDTH^2 instead of coeff pointers + // ISP_TODO: inside temp coeff array, entries are in the old order. PÖTKÖ switch (color) { case COLOR_Y: pred = &lcu->rec.y[offset]; @@ -1272,9 +1274,9 @@ static void quantize_tr_residual( cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { + // ISP_TODO: copy coeffs into CU order instead of pötkö cbf_set(&cur_pu->cbf, depth, color); - } - + } // ISP_TODO: if no coeffs, mem set width * height amount of coeffs to zero } /** diff --git a/src/transform.h b/src/transform.h index f8f81da6..69a9450f 100644 --- a/src/transform.h +++ b/src/transform.h @@ -103,8 +103,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + cu_loc_t *cu_loc, const int offset, const uint8_t mode, cu_info_t* pred_cu, From 0ae71feae4a876d6c74fecb03df32a3c97537eae Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 17:12:50 +0300 Subject: [PATCH 047/254] [isp] Fix assert. 
--- src/intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 123b82ae..0c8150ea 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1520,7 +1520,7 @@ int uvg_get_isp_split_num(const int width, const int height, const int split_typ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type) { assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); - assert((split_type == ISP_MODE_NO_ISP && split_idx > 0) && "Trying to ISP split when split type = NO_ISP."); + assert((split_type == ISP_MODE_NO_ISP && split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); int part_dim = block_w; if (split_type != ISP_MODE_NO_ISP) { part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type); From 69dcb04c99dc0854c73026ff4542b7b5c3f3bce1 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Aug 2022 18:45:13 +0300 Subject: [PATCH 048/254] [isp] Use temporary coeff array when quantizing coeffs. After deriving coeffs, copy temp coeffs from linear order to correct arrays with cu order. 
--- src/transform.c | 56 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/src/transform.c b/src/transform.c index b9fabd65..c5a38475 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1107,6 +1107,19 @@ int uvg_quantize_residual_trskip( return best->has_coeffs; } + +static INLINE int translate_to_cu_order_idx(const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int linear_idx) +{ + // ISP_TODO: get rid of all there temp variables after making sure this works + const int start_idx = lcu_x + lcu_y * LCU_WIDTH; + const int offset_x = linear_idx % block_w; + const int local_y = linear_idx / block_h; + const int offset_y = local_y * LCU_WIDTH; + + return (start_idx + offset_x + offset_y); +} + + /** * Calculate the residual coefficients for a single TU. * @@ -1124,6 +1137,7 @@ static void quantize_tr_residual( { const int x = cu_loc->x; const int y = cu_loc->y; + const uvg_config *cfg = &state->encoder_control->cfg; const int32_t shift = color == COLOR_Y ? 0 : 1; const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift}; @@ -1158,26 +1172,27 @@ static void quantize_tr_residual( uvg_pixel *pred = NULL; // Pointers to current location in arrays with reference. const uvg_pixel *ref = NULL; - // Pointers to current location in arrays with quantized coefficients. - coeff_t *coeff = NULL; + // Temp coeff array + coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t *dst_coeff = NULL; // ISP_TODO: use temp coeff array size MAX_TR_WIDTH^2 instead of coeff pointers - // ISP_TODO: inside temp coeff array, entries are in the old order. PÖTKÖ + // ISP_TODO: inside temp coeff array, entries are in the old linear order. 
PÖTKÖ switch (color) { case COLOR_Y: - pred = &lcu->rec.y[offset]; - ref = &lcu->ref.y[offset]; - coeff = &lcu->coeff.y[z_index]; + pred = &lcu->rec.y[offset]; + ref = &lcu->ref.y[offset]; + dst_coeff = &lcu->coeff.y; break; case COLOR_U: - pred = &lcu->rec.u[offset]; - ref = &lcu->ref.u[offset]; - coeff = &lcu->coeff.u[z_index]; + pred = &lcu->rec.u[offset]; + ref = &lcu->ref.u[offset]; + dst_coeff = &lcu->coeff.u; break; case COLOR_V: - pred = &lcu->rec.v[offset]; - ref = &lcu->ref.v[offset]; - coeff = &lcu->coeff.v[z_index]; + pred = &lcu->rec.v[offset]; + ref = &lcu->ref.v[offset]; + dst_coeff = &lcu->coeff.v; break; default: break; @@ -1274,9 +1289,22 @@ static void quantize_tr_residual( cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { - // ISP_TODO: copy coeffs into CU order instead of pötkö + const int coeffs_to_copy = tr_width * tr_height; + for (int i = 0; i < coeffs_to_copy; ++i) { + const coeff_t c = coeff[i]; + const idx = translate_to_cu_order_idx(lcu_px.x, lcu_px.y, tr_width, tr_height, i); + dst_coeff[idx] = c; + } cbf_set(&cur_pu->cbf, depth, color); - } // ISP_TODO: if no coeffs, mem set width * height amount of coeffs to zero + } + else { + // ISP_TODO: if no coeffs, mem set width * height amount of coeffs to zero + int idx = lcu_px.x + lcu_px.y * LCU_WIDTH; + for (int j = 0; j < tr_height; ++j) { + memset(dst_coeff[idx], 0, (sizeof(coeff_t) * tr_width)); + idx += LCU_WIDTH; + } + } } /** From b8506c757c6c3111e44f92a533427aae8b37f7ee Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 25 Aug 2022 14:54:42 +0300 Subject: [PATCH 049/254] [isp] Convert functions to handle new coeff array order. Add function for getting coeff array subset. Fix assert. 
--- src/cu.h | 6 ++- src/encode_coding_tree.c | 43 ++++++++++++++++--- src/encode_coding_tree.h | 2 + src/intra.c | 2 +- src/rdo.c | 20 ++++++--- src/search.c | 27 +++++++----- .../generic/encode_coding_tree-generic.c | 14 +++--- src/transform.c | 36 ++++------------ 8 files changed, 92 insertions(+), 58 deletions(-) diff --git a/src/cu.h b/src/cu.h index f5eeb5e6..dae446c4 100644 --- a/src/cu.h +++ b/src/cu.h @@ -415,9 +415,11 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu */ static INLINE void copy_coeffs(const coeff_t *__restrict src, coeff_t *__restrict dest, - size_t width, size_t height) + size_t width, size_t height, const int lcu_width) { - memcpy(dest, src, width * height * sizeof(coeff_t)); + for (int j = 0; j < height; ++j) { + memcpy(dest + j * lcu_width, src + j * lcu_width, width * sizeof(coeff_t)); + } } diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 3c2d1947..2421f7a0 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -417,7 +417,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, //ToDo: own ctx_offset and shift for X and Y uint8_t ctx_offset_x = type ? 0 : prefix_ctx[index_x]; uint8_t ctx_offset_y = type ? 0 : prefix_ctx[index_y]; - uint8_t shift_x = type ? CLIP(0, 2, width>>3) : (index_x+1)>>2; + uint8_t shift_x = type ? CLIP(0, 2, width >> 3) : (index_x + 1) >> 2; uint8_t shift_y = type ? 
CLIP(0, 2, width >> 3) : (index_y + 1) >> 2; double bits = 0; @@ -481,11 +481,15 @@ static void encode_chroma_tu( cabac_data_t* const cabac = &state->cabac; *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); if(!joint_chroma){ - const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + coeff_t coeff_u[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t coeff_v[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_u, coeff->u, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + uvg_get_sub_coeff(coeff_v, coeff->v, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) { - // ISP_TODO: do these checks need height? 
+ // TODO: height for this check and the others below if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; // HEVC only supports transform_skip for Luma @@ -504,7 +508,9 @@ static void encode_chroma_tu( } } else { - const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + const coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); @@ -544,7 +550,9 @@ static void encode_transform_unit( if (cbf_y && !only_chroma) { int x_local = x % LCU_WIDTH; int y_local = y % LCU_WIDTH; - const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + // const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, coeff->y, x_local, y_local, width, height, LCU_WIDTH); // CoeffNxN // Residual Coding @@ -1849,3 +1857,26 @@ void uvg_encode_mvd(encoder_state_t * const state, if(bits_out) *bits_out = temp_bits_out; } + + +/** + * \brief Get a subset of LCU coeff array. + * + * \param dst Destination array. Should be coeff_t [32*32]. + * \param src Coeff LCU array. + * \param lcu_x Local LCU x coordinate. + * \param lcu_y Local LCU y coordinate. + * \param width Block width. + * \param height Block height. + * \param lcu_width LCU_WIDTH for luma, LCU_WIDTH_C for chroma. 
+ * + */ +void uvg_get_sub_coeff(coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) +{ + // Take subset of coeff array + const coeff_t* coeff_ptr = &src[lcu_x + lcu_y * lcu_width]; + for (int j = 0; j < block_h; ++j) { + //memcpy(dst_coeff + (j * lcu_width), &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + memcpy(&dst[j * block_w], &coeff_ptr[j * lcu_width], block_w * sizeof(coeff_t)); + } +} diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 9757a327..7410a073 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -116,3 +116,5 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, uint8_t type, uint8_t scan, double* bits_out); + +void uvg_get_sub_coeff(coeff_t* dst, const coeff_t* const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h); diff --git a/src/intra.c b/src/intra.c index 0c8150ea..41a119d2 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1520,7 +1520,7 @@ int uvg_get_isp_split_num(const int width, const int height, const int split_typ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type) { assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); - assert((split_type == ISP_MODE_NO_ISP && split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); + assert((split_type != ISP_MODE_NO_ISP || split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); int part_dim = block_w; if (split_type != ISP_MODE_NO_ISP) { part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type); diff --git a/src/rdo.c b/src/rdo.c index c467dc94..bad372a9 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -305,11 +305,21 @@ static INLINE double get_coeff_cabac_cost( { const int width = cu_loc->width; const int height = 
cu_loc->height; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int lcu_width = color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C; + + int x_local = cu_loc->x % LCU_WIDTH; + int y_local = cu_loc->y % LCU_WIDTH; + // Make sure there are coeffs present bool found = false; - // ISP_TODO: this needs to be two separate x, y loops? - for (int i = 0; i < width * height; i++) { - if (coeff[i] != 0) { + + coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + + for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) { + if (sub_coeff[i] != 0) { found = 1; break; } @@ -332,7 +342,7 @@ static INLINE double get_coeff_cabac_cost( if(!tr_skip) { uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, - coeff, + sub_coeff, cu_loc, color, scan_mode, @@ -342,7 +352,7 @@ static INLINE double get_coeff_cabac_cost( else { uvg_encode_ts_residual((encoder_state_t* const)state, &cabac_copy, - coeff, + sub_coeff, width, height, color, diff --git a/src/search.c b/src/search.c index 293a807f..ddbdfd33 100644 --- a/src/search.c +++ b/src/search.c @@ -90,20 +90,27 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr } } +// ISP_TODO: this needs to work with the new coeff cu orderr static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum uvg_tree_type tree_type) { if (tree_type != UVG_CHROMA_T) { - const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); - copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], cu_loc->width, cu_loc->height); + //const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); + const int idx = (cu_loc->x % LCU_WIDTH) + ((cu_loc->y % LCU_WIDTH) * LCU_WIDTH); + copy_coeffs(&from->coeff.y[idx], &to->coeff.y[idx], cu_loc->width, cu_loc->height, LCU_WIDTH); + } if 
(from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); - copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); - copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); + //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); + const int chroma_x = cu_loc->x >> (tree_type != UVG_CHROMA_T); + const int chroma_y = cu_loc->y >> (tree_type != UVG_CHROMA_T); + + const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C); + copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + copy_coeffs(&from->coeff.v[idx], &to->coeff.v[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (joint) { - copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); + copy_coeffs(&from->coeff.joint_uv[idx], &to->coeff.joint_uv[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); } } } @@ -1672,7 +1679,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); // Copy coeffs to encoder state. 
- copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH); + copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( @@ -1689,9 +1696,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); } } diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 756cd6d6..acdfab94 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -64,6 +64,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, const int y = cu_loc->y; const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; uint8_t last_coeff_x = 0; @@ -94,14 +95,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, unsigned scan_cg_last = (unsigned)-1; unsigned scan_pos_last = (unsigned)-1; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - if (coeff[scan[i + j * width]]) { - scan_pos_last = i + j * width; - sig_coeffgroup_flag[scan_cg[(i + j * width) >> log2_cg_size]] = 1; - } + for (int i = 0; i < (width * height); ++i) { + if (coeff[scan[i]]) { + scan_pos_last = i; + sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } + scan_cg_last = scan_pos_last >> log2_cg_size; int pos_last = scan[scan_pos_last]; @@ -139,7 +139,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, int32_t temp_diag = -1; int32_t temp_sum = -1; - int32_t reg_bins = (width*width * 28) >> 4; //8 for 2x2 + int32_t reg_bins = (width * height * 28) >> 4; //8 for 2x2 // significant_coeff_flag for (i = scan_cg_last; i >= 0; i--) { diff --git a/src/transform.c b/src/transform.c index c5a38475..e5a3dc82 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1102,24 +1102,13 @@ int uvg_quantize_residual_trskip( // we can skip this. 
uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride); } - copy_coeffs(best->coeff, coeff_out, width, height); + // TODO: copying coeffs here is very suspect + copy_coeffs(best->coeff, coeff_out, width, height, width); return best->has_coeffs; } -static INLINE int translate_to_cu_order_idx(const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int linear_idx) -{ - // ISP_TODO: get rid of all there temp variables after making sure this works - const int start_idx = lcu_x + lcu_y * LCU_WIDTH; - const int offset_x = linear_idx % block_w; - const int local_y = linear_idx / block_h; - const int offset_y = local_y * LCU_WIDTH; - - return (start_idx + offset_x + offset_y); -} - - /** * Calculate the residual coefficients for a single TU. * @@ -1176,23 +1165,21 @@ static void quantize_tr_residual( coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; coeff_t *dst_coeff = NULL; - // ISP_TODO: use temp coeff array size MAX_TR_WIDTH^2 instead of coeff pointers - // ISP_TODO: inside temp coeff array, entries are in the old linear order. 
PÖTKÖ switch (color) { case COLOR_Y: pred = &lcu->rec.y[offset]; ref = &lcu->ref.y[offset]; - dst_coeff = &lcu->coeff.y; + dst_coeff = &lcu->coeff.y[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_U: pred = &lcu->rec.u[offset]; ref = &lcu->ref.u[offset]; - dst_coeff = &lcu->coeff.u; + dst_coeff = &lcu->coeff.u[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_V: pred = &lcu->rec.v[offset]; ref = &lcu->ref.v[offset]; - dst_coeff = &lcu->coeff.v; + dst_coeff = &lcu->coeff.v[lcu_px.x + lcu_px.y * lcu_width]; break; default: break; @@ -1248,6 +1235,7 @@ static void quantize_tr_residual( lmcs_chroma_adj); } else { if(color == COLOR_UV) { + // ISP_TODO: fix this has_coeffs = uvg_quant_cbcr_residual( state, cur_pu, @@ -1289,20 +1277,14 @@ static void quantize_tr_residual( cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { - const int coeffs_to_copy = tr_width * tr_height; - for (int i = 0; i < coeffs_to_copy; ++i) { - const coeff_t c = coeff[i]; - const idx = translate_to_cu_order_idx(lcu_px.x, lcu_px.y, tr_width, tr_height, i); - dst_coeff[idx] = c; + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); } cbf_set(&cur_pu->cbf, depth, color); } else { - // ISP_TODO: if no coeffs, mem set width * height amount of coeffs to zero - int idx = lcu_px.x + lcu_px.y * LCU_WIDTH; for (int j = 0; j < tr_height; ++j) { - memset(dst_coeff[idx], 0, (sizeof(coeff_t) * tr_width)); - idx += LCU_WIDTH; + memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); } } } From 4a21039e23db9ee8f677982339e6101acbf35e94 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 29 Aug 2022 12:37:23 +0300 Subject: [PATCH 050/254] [isp] Fix mistake in function declaration. 
--- src/encode_coding_tree.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 7410a073..5b4ce324 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -117,4 +117,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t width, uint8_t height, uint8_t type, uint8_t scan, double* bits_out); -void uvg_get_sub_coeff(coeff_t* dst, const coeff_t* const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h); +void uvg_get_sub_coeff(coeff_t* dst, const coeff_t* const src, + const int lcu_x, const int lcu_y, + const int block_w, const int block_h, + const int lcu_width); From 33ae02aae08ae405cbe0ccbc9f5e057f2f52f62f Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 30 Aug 2022 12:06:43 +0300 Subject: [PATCH 051/254] [isp] Fix mistake in isp cbf writing. Loop index was increased twice. --- src/intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 41a119d2..cb52a4bd 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1706,7 +1706,7 @@ void uvg_intra_recon_cu( uvg_quantize_lcu_residual(state, true, false, false, &split_loc, depth, cur_cu, lcu, false, tree_type); - search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << i; } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; From d050efcb871c58afa2041da704ae0a3483e8f527 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Aug 2022 12:28:57 +0300 Subject: [PATCH 052/254] [isp] Fix error in last sig coeff function call. Height was not used. Fix cbf writing. Fix transform skip flag writing. 
--- src/encode_coding_tree.c | 24 +++++++++++-------- src/search.c | 14 +++++++---- .../generic/encode_coding_tree-generic.c | 2 +- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 2421f7a0..fd5b6ce6 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -412,13 +412,13 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t type, uint8_t scan, double* bits_out) { const int index_x = uvg_math_floor_log2(width); - const int index_y = uvg_math_floor_log2(width); + const int index_y = uvg_math_floor_log2(height); const int prefix_ctx[8] = { 0, 0, 0, 3, 6, 10, 15, 21 }; //ToDo: own ctx_offset and shift for X and Y uint8_t ctx_offset_x = type ? 0 : prefix_ctx[index_x]; uint8_t ctx_offset_y = type ? 0 : prefix_ctx[index_y]; uint8_t shift_x = type ? CLIP(0, 2, width >> 3) : (index_x + 1) >> 2; - uint8_t shift_y = type ? CLIP(0, 2, width >> 3) : (index_y + 1) >> 2; + uint8_t shift_y = type ? CLIP(0, 2, height >> 3) : (index_y + 1) >> 2; double bits = 0; cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma); @@ -557,7 +557,7 @@ static void encode_transform_unit( // CoeffNxN // Residual Coding - if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP))) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 
1 : 0); @@ -619,7 +619,8 @@ static void encode_transform_coeff( bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, - bool last_split) // Always true except when writing sub partition coeffs (ISP) + bool last_split, + int *luma_cbf_ctx) // Always true except when writing sub partition coeffs (ISP) { cabac_data_t * const cabac = &state->cabac; const int x = cu_loc->x; @@ -706,7 +707,7 @@ static void encode_transform_coeff( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true); + encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, luma_cbf_ctx); } } return; @@ -718,8 +719,9 @@ static void encode_transform_coeff( // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); + *luma_cbf_ctx = 2 + cb_flag_y; } if (cb_flag_y | cb_flag_u | cb_flag_v) { @@ -1613,7 +1615,8 @@ void uvg_encode_coding_tree( } // Code (possible) coeffs to bitstream if (cbf) { - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true); + int luma_cbf_ctx = 0; + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, &luma_cbf_ctx); } encode_mts_idx(state, cabac, cur_cu); @@ -1628,6 +1631,7 @@ void uvg_encode_coding_tree( if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4 && tree_type == UVG_BOTH_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); } + int luma_cbf_ctx = 0; if (tree_type != UVG_CHROMA_T) { // Cycle through sub 
partitions if ISP enabled. @@ -1635,14 +1639,14 @@ void uvg_encode_coding_tree( // Small blocks are split only twice. int split_type = cur_cu->intra.isp_mode; int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); - + luma_cbf_ctx = split_limit != 1 ? 2 : 0; for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type); // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split); + encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, &luma_cbf_ctx); } } @@ -1661,7 +1665,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, &luma_cbf_ctx); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } diff --git a/src/search.c b/src/search.c index ddbdfd33..633f95f7 100644 --- a/src/search.c +++ b/src/search.c @@ -369,11 +369,13 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } } else { - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); // TODO: 8x4 CUs for (int i = 0; i < 4; i++) { + int luma_ctx = 2; if (i != 3 && isp_cbf != 0x8) { - CABAC_FBITS_UPDATE(cabac, ctx, (isp_cbf >> i) & 1, tr_tree_bits, "cbf_y_search"); + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); + luma_ctx = 2 + flag; } } } @@ -598,11 +600,13 @@ static double cu_rd_cost_tr_split_accurate( } } else { - cabac_ctx_t* ctx = 
&(cabac->ctx.qt_cbf_model_luma[0]); // TODO: 8x4 CUs for (int i = 0; i < 4; i++) { + int luma_ctx = 2; if (i != 3 && isp_cbf != 0x8) { - CABAC_FBITS_UPDATE(cabac, ctx, (isp_cbf >> i) & 1, tr_tree_bits, "cbf_y_search"); + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); + luma_ctx = 2 + flag; } } } @@ -1147,7 +1151,7 @@ static double search_cu( cu_loc_t isp_loc; uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); //search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); - cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x, isp_loc.y); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x % LCU_WIDTH, isp_loc.y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; cbf_clear(&split_cu->cbf, depth, COLOR_Y); if (cur_cbf) { diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index acdfab94..d0c87f4a 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -126,7 +126,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, last_coeff_x, last_coeff_y, width, - width, + height, color, scan_mode, bits_out); From 7398e5843108defb3e285b4277681e412f44aece Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Aug 2022 13:53:17 +0300 Subject: [PATCH 053/254] [isp] Fix coeff cost calculation. Coeff arrays were indexed wrongly. 
--- src/rdo.c | 27 +++++++++++++++++++-------- src/rdo.h | 3 ++- src/search.c | 36 ++++++++++++++++++++---------------- src/transform.c | 6 ++++-- src/transform.h | 3 +++ 5 files changed, 48 insertions(+), 27 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index bad372a9..c9f2db05 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -301,7 +301,8 @@ static INLINE double get_coeff_cabac_cost( color_t color, int8_t scan_mode, int8_t tr_skip, - cu_info_t* cur_tu) + cu_info_t* cur_tu, + int coeff_order) { const int width = cu_loc->width; const int height = cu_loc->height; @@ -315,11 +316,20 @@ static INLINE double get_coeff_cabac_cost( // Make sure there are coeffs present bool found = false; + coeff_t* coeff_ptr = NULL; coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + + if (coeff_order == COEFF_ORDER_LINEAR) { + coeff_ptr = coeff; + } + else { + // Coeff order CU + uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + coeff_ptr = sub_coeff; + } for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) { - if (sub_coeff[i] != 0) { + if (coeff_ptr[i] != 0) { found = 1; break; } @@ -342,7 +352,7 @@ static INLINE double get_coeff_cabac_cost( if(!tr_skip) { uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, - sub_coeff, + coeff_ptr, cu_loc, color, scan_mode, @@ -352,7 +362,7 @@ static INLINE double get_coeff_cabac_cost( else { uvg_encode_ts_residual((encoder_state_t* const)state, &cabac_copy, - sub_coeff, + coeff_ptr, width, height, color, @@ -408,7 +418,8 @@ double uvg_get_coeff_cost( cu_loc_t *cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip) + int8_t tr_skip, + int coeff_order) { uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on; uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on; @@ -428,13 +439,13 @@ double uvg_get_coeff_cost( uint64_t weights = uvg_fast_coeff_get_weights(state); uint32_t 
fast_cost = uvg_fast_coeff_cost(coeff, width, height, weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu, coeff_order); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu, coeff_order); if (save_cccs) { save_ccc(state->qp, coeff, width * width, ccc); } diff --git a/src/rdo.h b/src/rdo.h index c9b88df3..eb9714f6 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -76,7 +76,8 @@ double uvg_get_coeff_cost( cu_loc_t *cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip); + int8_t tr_skip, + int coeff_order); int32_t uvg_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_gt1, uint16_t ctx_num_gt2, uint16_t ctx_num_par, uint16_t abs_go_rice, uint32_t reg_bins, int8_t type, int use_limited_prefix_length); diff --git a/src/search.c b/src/search.c index 633f95f7..66eac299 100644 --- a/src/search.c +++ b/src/search.c @@ -393,9 +393,10 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, if (!skip_residual_coding) { int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } else { int split_type = pred_cu->intra.isp_mode; @@ -408,9 +409,10 @@ 
double uvg_cu_rd_cost_luma(const encoder_state_t *const state, const int part_y = split_loc.y; // TODO: maybe just pass the cu_loc_t to these functions - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } } } @@ -501,14 +503,14 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, if (!skip_residual_coding) { int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + //const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, &loc, 2, scan_order, 0); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, &loc, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); } else { - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, &loc, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); } } @@ -638,9 +640,10 @@ static double cu_rd_cost_tr_split_accurate( } int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, 
y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } else { int split_type = pred_cu->intra.isp_mode; @@ -653,9 +656,10 @@ static double cu_rd_cost_tr_split_accurate( const int part_y = split_loc.y; // TODO: maybe just pass the cu_loc_t to these functions - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } } } @@ -687,7 +691,7 @@ static double cu_rd_cost_tr_split_accurate( const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); const int chroma_height = chroma_width; // TODO: height for non-square blocks int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size); if(pred_cu->joint_cb_cr == 0) { @@ -707,8 +711,8 @@ static double cu_rd_cost_tr_split_accurate( if(chroma_can_use_tr_skip && cb_flag_v) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, 
&loc, COLOR_V, scan_order, tr_cu->tr_skip & 4); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { @@ -725,7 +729,7 @@ static double cu_rd_cost_tr_split_accurate( if (chroma_can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, &loc, COLOR_U, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } diff --git a/src/transform.c b/src/transform.c index e5a3dc82..30e3bd64 100644 --- a/src/transform.c +++ b/src/transform.c @@ -694,7 +694,8 @@ void uvg_chroma_transform_search( cu_loc, COLOR_U, scan_order, - transforms[i] == CHROMA_TS); + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); u_bits += coeff_cost; } if (cbf_v && !IS_JCCR_MODE(transforms[i])) { @@ -710,7 +711,8 @@ void uvg_chroma_transform_search( cu_loc, COLOR_V, scan_order, - transforms[i] == CHROMA_TS); + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); } if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { if(uvg_is_lfnst_allowed(state, pred_cu, width, height, 0, 0 , UVG_CHROMA_T, COLOR_UV, lcu)) { diff --git a/src/transform.h b/src/transform.h index 69a9450f..78a2325a 100644 --- a/src/transform.h +++ b/src/transform.h @@ -47,6 +47,9 @@ extern const uint8_t uvg_g_chroma_scale[58]; extern const int16_t uvg_g_inv_quant_scales[6]; extern const int16_t uvg_g_quant_scales[6]; +#define COEFF_ORDER_LINEAR 0 +#define COEFF_ORDER_CU 1 + void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_itransformskip(const encoder_control_t *encoder, int16_t 
*block,int16_t *coeff, int8_t width, int8_t height); From d8d206365c60afdbde4632658bad0c7f44d8c802 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Aug 2022 13:57:00 +0300 Subject: [PATCH 054/254] [isp] Fix jccr coeffs. --- src/transform.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transform.c b/src/transform.c index 30e3bd64..8d0c3645 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1156,7 +1156,7 @@ static void quantize_tr_residual( const coeff_scan_order_t scan_idx = uvg_get_scan_order(cur_pu->type, mode, depth); // Height does not affect this const int offset = lcu_px.x + lcu_px.y * lcu_width; - const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); + //const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); // Pointers to current location in arrays with prediction. The // reconstruction will be written to this array. @@ -1183,6 +1183,9 @@ static void quantize_tr_residual( ref = &lcu->ref.v[offset]; dst_coeff = &lcu->coeff.v[lcu_px.x + lcu_px.y * lcu_width]; break; + case COLOR_UV: + dst_coeff = &lcu->coeff.joint_uv[lcu_px.x + lcu_px.y * lcu_width]; + break; default: break; } @@ -1237,7 +1240,6 @@ static void quantize_tr_residual( lmcs_chroma_adj); } else { if(color == COLOR_UV) { - // ISP_TODO: fix this has_coeffs = uvg_quant_cbcr_residual( state, cur_pu, @@ -1249,7 +1251,7 @@ static void quantize_tr_residual( &lcu->ref.u[offset], &lcu->ref.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], - &lcu->coeff.joint_uv[z_index], + coeff, early_skip, lmcs_chroma_adj, tree_type From 33cd44f11bd70cf4cd03122a8d36ed595bca1bbf Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Aug 2022 14:54:30 +0300 Subject: [PATCH 055/254] [isp] Fix chroma coeff writing for ISP. 
--- src/encode_coding_tree.c | 19 +++++++++++-------- src/search.c | 39 +++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index fd5b6ce6..9b0bf884 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -527,7 +527,8 @@ static void encode_transform_unit( bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, - bool last_split) + bool last_split, + const cu_loc_t *original_loc) // Original cu dimensions, before CU split { assert(depth >= 1 && depth <= MAX_PU_DEPTH); @@ -596,7 +597,8 @@ static void encode_transform_unit( cbf_is_set(cur_pu->cbf, depth, COLOR_V); if ((chroma_cbf_set || joint_chroma) && last_split) { //Need to drop const to get lfnst constraints - encode_chroma_tu(state, cu_loc, depth, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + // Use original dimensions instead of ISP split dimensions + encode_chroma_tu(state, original_loc, depth, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); } } @@ -620,7 +622,8 @@ static void encode_transform_coeff( lcu_coeff_t* coeff, enum uvg_tree_type tree_type, bool last_split, - int *luma_cbf_ctx) // Always true except when writing sub partition coeffs (ISP) + int *luma_cbf_ctx, // Always true except when writing sub partition coeffs (ISP) + cu_loc_t *original_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; const int x = cu_loc->x; @@ -707,7 +710,7 @@ static void encode_transform_coeff( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, luma_cbf_ctx); + encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); } } return; @@ -761,7 +764,7 @@ static void 
encode_transform_coeff( CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, cu_loc, depth, only_chroma, coeff, tree_type, last_split); + encode_transform_unit(state, cu_loc, depth, only_chroma, coeff, tree_type, last_split, original_loc); } } @@ -1616,7 +1619,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (cbf) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, &luma_cbf_ctx); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, &luma_cbf_ctx, &cu_loc); } encode_mts_idx(state, cabac, cur_cu); @@ -1646,7 +1649,7 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, &luma_cbf_ctx); + encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, &luma_cbf_ctx, &cu_loc); } } @@ -1665,7 +1668,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, &luma_cbf_ctx); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, &luma_cbf_ctx, &cu_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } diff --git a/src/search.c b/src/search.c index 66eac299..5dad1b3b 100644 --- a/src/search.c +++ b/src/search.c @@ -1148,20 +1148,7 @@ static double search_cu( depth, &intra_search, NULL, lcu, tree_type,recon_luma,recon_chroma); - // Set isp split cbfs here - const int split_type = intra_search.pred_cu.intra.isp_mode; - const int split_num = split_type == ISP_MODE_NO_ISP ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type); - for (int i = 0; i < split_num; ++i) { - cu_loc_t isp_loc; - uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); - //search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); - cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x % LCU_WIDTH, isp_loc.y % LCU_WIDTH); - bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; - cbf_clear(&split_cu->cbf, depth, COLOR_Y); - if (cur_cbf) { - cbf_set(&split_cu->cbf, depth, COLOR_Y); - } - } + if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; @@ -1173,6 +1160,30 @@ static double search_cu( tree_type,false,true); } if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; + + // Set isp split cbfs here + const int split_type = intra_search.pred_cu.intra.isp_mode; + const int split_num = split_type == ISP_MODE_NO_ISP ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type); + + const int cbf_cb = cbf_is_set(cur_cu->cbf, depth, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int jccr = cur_cu->joint_cb_cr; + for (int i = 0; i < split_num; ++i) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); + //search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x % LCU_WIDTH, isp_loc.y % LCU_WIDTH); + bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; + cbf_clear(&split_cu->cbf, depth, COLOR_Y); + cbf_clear(&split_cu->cbf, depth, COLOR_U); + cbf_clear(&split_cu->cbf, depth, COLOR_V); + if (cur_cbf) { + cbf_set(&split_cu->cbf, depth, COLOR_Y); + } + if(cbf_cb) cbf_set(&split_cu->cbf, depth, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, depth, COLOR_V); + split_cu->joint_cb_cr = jccr; + } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); From 59292d880888db093b24755e9337e0e696ccb602 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 2 Sep 2022 11:03:08 +0300 Subject: [PATCH 056/254] [isp] Add extra logic to reference building to accommodate ISP. Remove some asserts which were invalidated by ISP. 
--- src/cu.c | 3 +- src/intra.c | 110 +++++++++++++++++++++---- src/intra.h | 1 + src/search_intra.c | 12 +-- src/strategies/generic/quant-generic.c | 6 +- 5 files changed, 106 insertions(+), 26 deletions(-) diff --git a/src/cu.c b/src/cu.c index f47f5cf3..10d99943 100644 --- a/src/cu.c +++ b/src/cu.c @@ -265,7 +265,8 @@ void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) { assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions."); assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH)."); - assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4."); // TODO: change if luma size 2 is allowed + // This check is no longer valid. With non-square blocks and ISP enabled, even 1x16 and 16x1 (ISP needs at least 16 samples) blocks are valid + //assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4."); loc->x = x; loc->y = y; diff --git a/src/intra.c b/src/intra.c index cb52a4bd..12af3362 100644 --- a/src/intra.c +++ b/src/intra.c @@ -930,7 +930,7 @@ static void intra_predict_regular( uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || width != height /*Fake ISP*/) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || width != height /*ISP_TODO: replace this fake ISP check*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. @@ -984,14 +984,15 @@ void uvg_intra_build_reference_any( const lcu_t *const lcu, uvg_intra_references *const refs, const uint8_t multi_ref_idx, - uvg_pixel *extra_ref_lines) + uvg_pixel *extra_ref_lines, + bool is_isp) { const int width = color == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); refs->filtered_initialized = false; uvg_pixel *out_left_ref = &refs->ref.left[0]; @@ -1057,7 +1058,13 @@ void uvg_intra_build_reference_any( // Generate left reference. if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_left; + if (is_isp && !is_chroma) { + px_available_left = height; + } + else { + px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + } // Limit the number of available pixels based on block size and dimensions // of the picture. @@ -1161,10 +1168,16 @@ void uvg_intra_build_reference_any( } // Generate top reference. + int px_available_top; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; - + if (is_isp && !is_chroma) { + px_available_top = width; + } + else { + px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + } + // Limit the number of available pixels based on block size and dimensions // of the picture. px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); @@ -1197,7 +1210,8 @@ void uvg_intra_build_reference_inner( uvg_intra_references *const refs, bool entropy_sync, const uint8_t multi_ref_idx, - uvg_pixel* extra_ref_lines) + uvg_pixel* extra_ref_lines, + bool is_isp) { const int width = color == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -1310,8 +1324,14 @@ void uvg_intra_build_reference_inner( } // Generate left reference. -// Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + // Get the number of reference pixels based on the PU coordinate within the LCU. + int px_available_left; + if (is_isp && !is_chroma) { + px_available_left = height; + } + else { + px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + } // Limit the number of available pixels based on block size and dimensions // of the picture. @@ -1347,7 +1367,13 @@ void uvg_intra_build_reference_inner( // Generate top reference. // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_top; + if (is_isp && !is_chroma) { + px_available_top = width; + } + else { + px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + } // Limit the number of available pixels based on block size and dimensions // of the picture. 
@@ -1373,7 +1399,46 @@ void uvg_intra_build_reference_inner( } } +void uvg_intra_build_reference_isp( + const cu_loc_t* const split_loc, + const cu_loc_t* const origin, + const color_t color, + const vector2d_t* const luma_px, + const vector2d_t* const pic_px, + const lcu_t* const lcu, + uvg_intra_references* const refs, + bool entropy_sync, + const int isp_mode) +{ + int ref_length_top = 0, ref_length_left = 0; + + bool left_available = split_loc->x > 0; + bool above_available = split_loc->y > 0; + + if (split_loc->x == origin->x && split_loc->y == origin->y) + { + // First ISP split, call reference builders normally + if (luma_px->x > 0 && luma_px->y > 0) { + uvg_intra_build_reference_inner(split_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, 0, NULL, false); + } + else { + uvg_intra_build_reference_any(split_loc, color, luma_px, pic_px, lcu, refs, 0, NULL, false); + } + + } + else + { + if (luma_px->x > 0 && luma_px->y > 0) { + uvg_intra_build_reference_inner(split_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, 0, NULL, true); + } + else { + uvg_intra_build_reference_any(split_loc, color, luma_px, pic_px, lcu, refs, 0, NULL, true); + } + } +} + void uvg_intra_build_reference( + const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, @@ -1386,11 +1451,19 @@ void uvg_intra_build_reference( { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); + bool is_isp = (pu_loc->x != cu_loc->x) || (pu_loc->y != cu_loc->y); + + // If isp is in use, some extra logic is needed + if (is_isp) { + uvg_intra_build_reference_isp(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, is_isp); + return; + } + // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines); + 
uvg_intra_build_reference_inner(pu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, is_isp); } else { - uvg_intra_build_reference_any(cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_any(pu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, is_isp); } } @@ -1538,6 +1611,7 @@ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int bl static void intra_recon_tb_leaf( encoder_state_t* const state, + const cu_loc_t* pu_loc, const cu_loc_t* cu_loc, lcu_t *lcu, color_t color, @@ -1586,7 +1660,7 @@ static void intra_recon_tb_leaf( } } - uvg_intra_build_reference(cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + uvg_intra_build_reference(pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); uvg_pixel pred[32 * 32]; uvg_intra_predict(state, &refs, cu_loc, color, pred, search_data, lcu, tree_type); @@ -1697,12 +1771,14 @@ void uvg_intra_recon_cu( // Small blocks are split only twice. int split_type = search_data->pred_cu.intra.isp_mode; int split_limit = uvg_get_isp_split_num(width, height, split_type); + cu_loc_t origin_cu; + uvg_cu_loc_ctor(&origin_cu, x, y, width, height); for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; uvg_get_isp_split_loc(&split_loc, x, y, width, height, i, split_type); - intra_recon_tb_leaf(state, &split_loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, &split_loc, &origin_cu, lcu, COLOR_Y, search_data, tree_type); uvg_quantize_lcu_residual(state, true, false, false, &split_loc, depth, cur_cu, lcu, false, tree_type); @@ -1717,11 +1793,11 @@ void uvg_intra_recon_cu( // Process a leaf TU. 
if (has_luma) { - intra_recon_tb_leaf(state, &loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_Y, search_data, tree_type); } if (has_chroma) { - intra_recon_tb_leaf(state, &loc, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, &loc, lcu, COLOR_V, search_data, tree_type); + intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_U, search_data, tree_type); + intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_V, search_data, tree_type); } // TODO: not necessary to call if only luma and ISP is on diff --git a/src/intra.h b/src/intra.h index 51ed41c9..c59ec497 100644 --- a/src/intra.h +++ b/src/intra.h @@ -108,6 +108,7 @@ int8_t uvg_intra_get_dir_luma_predictor( * \param multi_ref_idx Multi reference line index for the prediction block. */ void uvg_intra_build_reference( + const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, diff --git a/src/search_intra.c b/src/search_intra.c index 5f0b3669..46db2f80 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -709,10 +709,10 @@ static int search_intra_chroma_rough( const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; uvg_intra_references refs_u; - uvg_intra_build_reference(&loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); uvg_intra_references refs_v; - uvg_intra_build_reference(&loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 }; uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; @@ -1514,8 +1514,8 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - 
uvg_intra_build_reference(&loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); - uvg_intra_build_reference(&loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cabac_data_t temp_cabac; @@ -1858,7 +1858,7 @@ void uvg_search_cu_intra( int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); if (depth > 0) { - uvg_intra_build_reference(&cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); } // The maximum number of possible MIP modes depend on block size & shape @@ -1926,7 +1926,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(&cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 3de27958..4215fc81 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -456,8 +456,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int has_coeffs = 0; - assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); - assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); 
+ // With ISP these checks no longer apply, since width and height 2 is now possible + // With MTT even 1x16 and 16x1 ISP splits are possible + //assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); + //assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); From 3e23fd0601ce715527fb531579fe3fe76b29ab1a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 5 Sep 2022 11:10:55 +0300 Subject: [PATCH 057/254] [cabac] fix cbf_y context for tr splits --- src/encode_coding_tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 9b0bf884..0fe488f9 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -724,7 +724,9 @@ static void encode_transform_coeff( if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); - *luma_cbf_ctx = 2 + cb_flag_y; + if (tr_depth == 0) { + *luma_cbf_ctx = 2 + cb_flag_y; + } } if (cb_flag_y | cb_flag_u | cb_flag_v) { From cb7f9919e3398090a51bdbba68df958dca549715 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 5 Sep 2022 11:20:09 +0300 Subject: [PATCH 058/254] [jccr] Fix jccr coefficient copying --- src/transform.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transform.c b/src/transform.c index 8d0c3645..825df5b2 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1257,6 +1257,17 @@ static void quantize_tr_residual( tree_type ); cur_pu->joint_cb_cr = has_coeffs; + if (has_coeffs) { + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + } + cbf_set(&cur_pu->cbf, depth, color); + } + else { + for (int j = 0; j < tr_height; ++j) { + 
memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); + } + } return; } From e0e96068ccc41cd634093d7cc29a854df6e571a2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 5 Sep 2022 12:04:21 +0300 Subject: [PATCH 059/254] [lfnst] lfnst is not allowed for transform split --- src/encode_coding_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 0fe488f9..be66d8f3 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -114,7 +114,7 @@ bool uvg_is_lfnst_allowed( const color_t color, const lcu_t* lcu) { - if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA) { + if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && pred_cu->depth == pred_cu->tr_depth) { const int isp_mode = pred_cu->intra.isp_mode; const int depth = pred_cu->depth; const int chroma_width = width >> 1; From 88c33c048969fe7b067bc7e7150bba39bbe841fd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 5 Sep 2022 16:51:23 +0300 Subject: [PATCH 060/254] [lfnst] Fix lfnst constraint checking for the new coeff order --- src/search_intra.c | 30 +++++++++++++++--------------- src/transform.c | 13 ++++++++++--- src/transform.h | 4 +++- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 46db2f80..d5e8574b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -205,13 +205,14 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); - const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, lcu->coeff.y, lcu_px.x, lcu_px.y, width, height, LCU_WIDTH); signed 
scan_cg_last = -1; signed scan_pos_last = -1; for (int i = 0; i < width * height; i++) { - if (coeff[scan[i]]) { + if (coeff_y[scan[i]]) { scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } @@ -420,10 +421,11 @@ static double search_intra_trdepth( pred_cu, depth, constraints, - &lcu->coeff.y[scan_offset], + lcu->coeff.y, width, - height - ); + height, + &lcu_px, + COLOR_Y); } if (!constraints[1] && cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) { @@ -493,10 +495,6 @@ static double search_intra_trdepth( pred_cu->intra.mode_chroma = chroma_mode; pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH_C, - lcu_px.x, - lcu_px.y); uvg_intra_recon_cu( state, x_px, @@ -526,10 +524,11 @@ static double search_intra_trdepth( pred_cu, depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + width_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; @@ -538,10 +537,11 @@ static double search_intra_trdepth( pred_cu, depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + width_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; diff --git a/src/transform.c b/src/transform.c index 825df5b2..8b903579 100644 --- a/src/transform.c +++ b/src/transform.c @@ -178,7 +178,9 @@ void uvg_derive_lfnst_constraints( bool* constraints, const coeff_t* coeff, const int width, - const int height) + const int height, + const vector2d_t * const lcu_px, + color_t color) { coeff_scan_order_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); // ToDo: large block support in VVC? 
@@ -187,6 +189,11 @@ void uvg_derive_lfnst_constraints( const uint32_t* scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; signed scan_pos_last = -1; + coeff_t temp[TR_MAX_WIDTH * TR_MAX_WIDTH]; + if(lcu_px != NULL) { + uvg_get_sub_coeff(temp, coeff, lcu_px->x, lcu_px->y, width, height, color == COLOR_Y? LCU_WIDTH : LCU_WIDTH_C); + coeff = temp; + } for (int i = 0; i < width * height; i++) { if (coeff[scan[i]]) { @@ -576,9 +583,9 @@ void uvg_chroma_transform_search( if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { bool constraints[2] = { false, false }; - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, u_quant_coeff, width, height); + uvg_derive_lfnst_constraints(pred_cu, depth, constraints, u_quant_coeff, width, height, NULL, COLOR_U); if(!IS_JCCR_MODE(transforms[i])) { - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, v_quant_coeff, width, height); + uvg_derive_lfnst_constraints(pred_cu, depth, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; } diff --git a/src/transform.h b/src/transform.h index 78a2325a..6fdef411 100644 --- a/src/transform.h +++ b/src/transform.h @@ -78,7 +78,9 @@ void uvg_derive_lfnst_constraints( bool* constraints, const coeff_t* coeff, const int width, - const int height); + const int height, + const vector2d_t * const , + color_t color); typedef struct { double best_u_cost; From 6340dfe4ce303719e805fa80e31ae4054d3841c7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 5 Sep 2022 16:56:22 +0300 Subject: [PATCH 061/254] [isp] Fix mistake in pu_loc argument passing, was not used after passing. 
--- src/intra.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/intra.c b/src/intra.c index 12af3362..640b3c48 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1410,11 +1410,6 @@ void uvg_intra_build_reference_isp( bool entropy_sync, const int isp_mode) { - int ref_length_top = 0, ref_length_left = 0; - - bool left_available = split_loc->x > 0; - bool above_available = split_loc->y > 0; - if (split_loc->x == origin->x && split_loc->y == origin->y) { // First ISP split, call reference builders normally @@ -1621,11 +1616,11 @@ static void intra_recon_tb_leaf( const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; - const int x = cu_loc->x; - const int y = cu_loc->y; + const int x = pu_loc->x; + const int y = pu_loc->y; - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? 
pu_loc->height : pu_loc->chroma_height; int log2_width = uvg_g_convert_to_log2[width]; int log2_height = uvg_g_convert_to_log2[height]; @@ -1663,7 +1658,7 @@ static void intra_recon_tb_leaf( uvg_intra_build_reference(pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); uvg_pixel pred[32 * 32]; - uvg_intra_predict(state, &refs, cu_loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, pu_loc, color, pred, search_data, lcu, tree_type); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; From a261d4c5b3387e2c8b2d88e8bafee86c57986b78 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 5 Sep 2022 18:14:59 +0300 Subject: [PATCH 062/254] [isp] WIP --- src/intra.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 640b3c48..9a16bec0 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1446,9 +1446,10 @@ void uvg_intra_build_reference( { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); + // This will be false for first ISP split bool is_isp = (pu_loc->x != cu_loc->x) || (pu_loc->y != cu_loc->y); - // If isp is in use, some extra logic is needed + // If isp is in use, some extra logic is needed. For first split, old reference builders can be used. 
if (is_isp) { uvg_intra_build_reference_isp(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, is_isp); return; From 08942a53944859150119ecdcbe3207d7e74705bd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 6 Sep 2022 08:49:22 +0300 Subject: [PATCH 063/254] [tr-skip] fix transform skip flag writing --- src/encode_coding_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index be66d8f3..96feb36c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -558,7 +558,7 @@ static void encode_transform_unit( // CoeffNxN // Residual Coding - if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP))) { + if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 
1 : 0); From 662f31d61d67f25b8c90e352fe5295d431281058 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 6 Sep 2022 09:30:46 +0300 Subject: [PATCH 064/254] [isp] Use correct coordinates for depth 4 chroma tu cost calculation --- src/search.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/search.c b/src/search.c index 5dad1b3b..ff426dda 100644 --- a/src/search.c +++ b/src/search.c @@ -688,6 +688,7 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 }; + uvg_cu_loc_ctor(&loc, lcu_px.x, lcu_px.y, width, height); const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); const int chroma_height = chroma_width; // TODO: height for non-square blocks int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); From 4794104eccfa241fbb02b0f29298a2d7dbcb25b4 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 8 Sep 2022 12:06:22 +0300 Subject: [PATCH 065/254] [isp] Fix errors in reference building. Use cubic filter during prediction if ISP enabled. --- src/intra.c | 182 +++++++++++++++---------- src/search_intra.c | 12 +- src/strategies/avx2/intra-avx2.c | 11 +- src/strategies/generic/intra-generic.c | 8 +- src/strategies/strategies-intra.h | 3 +- 5 files changed, 136 insertions(+), 80 deletions(-) diff --git a/src/intra.c b/src/intra.c index 9a16bec0..61d479d6 100644 --- a/src/intra.c +++ b/src/intra.c @@ -918,7 +918,8 @@ static void intra_predict_regular( int_fast8_t mode, color_t color, uvg_pixel *dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode) { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -928,6 +929,7 @@ static void intra_predict_regular( // MRL only for luma uint8_t multi_ref_index = color == COLOR_Y ? 
multi_ref_idx : 0; + uint8_t isp = color == COLOR_Y ? isp_mode : 0; const uvg_intra_ref *used_ref = &refs->ref; if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || width != height /*ISP_TODO: replace this fake ISP check*/) { @@ -963,7 +965,7 @@ static void intra_predict_regular( } else if (mode == 1) { intra_pred_dc(cu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(cu_loc, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index); + uvg_angular_pred(cu_loc, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index, isp); } // pdpc @@ -976,7 +978,25 @@ static void intra_predict_regular( } +int get_isp_ref_pixels_num(const int lcu_x, const int lcu_y, const int width, const int height, const int isp_mode) +{ + // TODO: this only works until non-square blocks are implemented + const int block_size = MAX(width, height); + const int split_size = MIN(width, height); + if (isp_mode == ISP_MODE_HOR) { + int ref_pix_left = LCU_WIDTH - lcu_y; + } + else if (isp_mode == ISP_MODE_VER) { + + } + else { + assert(false && "This should never trigger."); + } +} + + void uvg_intra_build_reference_any( + const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, @@ -985,13 +1005,21 @@ void uvg_intra_build_reference_any( uvg_intra_references *const refs, const uint8_t multi_ref_idx, uvg_pixel *extra_ref_lines, - bool is_isp) + const uint8_t isp_mode) { - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? 
pu_loc->height : pu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; + // These are only used with ISP, so no need to check chroma + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); refs->filtered_initialized = false; @@ -1059,8 +1087,19 @@ void uvg_intra_build_reference_any( if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. int px_available_left; - if (is_isp && !is_chroma) { - px_available_left = height; + if (isp_mode && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + // Left LCU edge has more pixels available + if (lcu_px.x > 0) { + px_available_left = cu_height - (pu_y - cu_y); + } + else { + px_available_left = LCU_WIDTH - lcu_px.y; + } + } } else { px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; @@ -1069,7 +1108,7 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. // TODO: height for non-square blocks - px_available_left = MIN(px_available_left, height * 2 + multi_ref_index); + px_available_left = MIN(px_available_left, cu_height * 2 + multi_ref_index); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1079,7 +1118,7 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. 
uvg_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride]; - for (int i = px_available_left; i < height * 2 + multi_ref_index * 2; ++i) { + for (int i = px_available_left; i < cu_height * 2 + multi_ref_index * 2; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { @@ -1171,8 +1210,18 @@ void uvg_intra_build_reference_any( int px_available_top; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - if (is_isp && !is_chroma) { - px_available_top = width; + if (isp_mode && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + if (lcu_px.y > 0) { + px_available_top = LCU_WIDTH - lcu_px.x; + } + else { + px_available_top = LCU_WIDTH; + } + } } else { px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; @@ -1180,7 +1229,7 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); // Copy all the pixels we can. @@ -1195,13 +1244,14 @@ void uvg_intra_build_reference_any( } else { // Extend nearest pixel. uvg_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + for (int i = 0; i < cu_width * 2 + multi_ref_index; i++) { out_top_ref[i + 1] = nearest_pixel; } } } void uvg_intra_build_reference_inner( + const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, @@ -1211,13 +1261,21 @@ void uvg_intra_build_reference_inner( bool entropy_sync, const uint8_t multi_ref_idx, uvg_pixel* extra_ref_lines, - bool is_isp) + uint8_t isp_mode) { - const int width = color == COLOR_Y ? 
cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int cu_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; + // These are only used with ISP, so no need to check chroma + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); refs->filtered_initialized = false; @@ -1326,8 +1384,20 @@ void uvg_intra_build_reference_inner( // Get the number of reference pixels based on the PU coordinate within the LCU. int px_available_left; - if (is_isp && !is_chroma) { - px_available_left = height; + if (isp_mode && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + // Left LCU edge has more pixels available + if (lcu_px.x > 0) { + px_available_left = cu_height - (pu_y - cu_y); + } + else { + px_available_left = LCU_WIDTH - lcu_px.y; + } + } + } else { px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; @@ -1335,7 +1405,7 @@ void uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_left = MIN(px_available_left, width * 2); + px_available_left = MIN(px_available_left, cu_height * 2); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1350,7 +1420,7 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. 
uvg_pixel nearest_pixel = out_left_ref[i]; - for (; i < width * 2; i += 4) { + for (; i < cu_height * 2; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; @@ -1368,8 +1438,18 @@ void uvg_intra_build_reference_inner( // Get the number of reference pixels based on the PU coordinate within the LCU. int px_available_top; - if (is_isp && !is_chroma) { - px_available_top = width; + if (isp_mode && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + if (lcu_px.y > 0) { + px_available_top = LCU_WIDTH - lcu_px.x; + } + else { + px_available_top = LCU_WIDTH; + } + } } else { px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; @@ -1377,7 +1457,7 @@ void uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1); @@ -1391,7 +1471,7 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. 
nearest_pixel = out_top_ref[i + multi_ref_index]; - for (; i < (width + multi_ref_index) * 2; i += 4) { + for (; i < (cu_width + multi_ref_index) * 2; i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel; @@ -1399,38 +1479,6 @@ void uvg_intra_build_reference_inner( } } -void uvg_intra_build_reference_isp( - const cu_loc_t* const split_loc, - const cu_loc_t* const origin, - const color_t color, - const vector2d_t* const luma_px, - const vector2d_t* const pic_px, - const lcu_t* const lcu, - uvg_intra_references* const refs, - bool entropy_sync, - const int isp_mode) -{ - if (split_loc->x == origin->x && split_loc->y == origin->y) - { - // First ISP split, call reference builders normally - if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(split_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, 0, NULL, false); - } - else { - uvg_intra_build_reference_any(split_loc, color, luma_px, pic_px, lcu, refs, 0, NULL, false); - } - - } - else - { - if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(split_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, 0, NULL, true); - } - else { - uvg_intra_build_reference_any(split_loc, color, luma_px, pic_px, lcu, refs, 0, NULL, true); - } - } -} void uvg_intra_build_reference( const cu_loc_t* const pu_loc, @@ -1442,24 +1490,19 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_ref_lines, - uint8_t multi_ref_idx) + uint8_t multi_ref_idx, + const uint8_t isp_mode) { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); - // This will be false for first ISP split - bool is_isp = (pu_loc->x != cu_loc->x) || (pu_loc->y != cu_loc->y); - - // If isp is in use, some extra logic is needed. For first split, old reference builders can be used. 
- if (is_isp) { - uvg_intra_build_reference_isp(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, is_isp); - return; - } + bool first_split = color == COLOR_Y && isp_mode && pu_loc->x == cu_loc->x && pu_loc->y == cu_loc->y; + uint8_t isp = first_split ? 0 : isp_mode; // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(pu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, is_isp); + uvg_intra_build_reference_inner(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp); } else { - uvg_intra_build_reference_any(pu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, is_isp); + uvg_intra_build_reference_any(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp); } } @@ -1498,7 +1541,7 @@ void uvg_intra_predict( mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { - intra_predict_regular(state, refs, cu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + intra_predict_regular(state, refs, cu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); } } else { @@ -1636,6 +1679,7 @@ static void intra_recon_tb_leaf( int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0; + uint8_t isp_mode = color == COLOR_Y ? search_data->pred_cu.intra.isp_mode : 0; uvg_intra_references refs; // Extra reference lines for use with MRL. Extra lines needed only for left edge. 
@@ -1656,7 +1700,7 @@ static void intra_recon_tb_leaf( } } - uvg_intra_build_reference(pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + uvg_intra_build_reference(pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; uvg_intra_predict(state, &refs, pu_loc, color, pred, search_data, lcu, tree_type); diff --git a/src/search_intra.c b/src/search_intra.c index d5e8574b..cb485414 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -709,10 +709,10 @@ static int search_intra_chroma_rough( const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; uvg_intra_references refs_u; - uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0, 0); uvg_intra_references refs_v; - uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 }; uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; @@ -1514,8 +1514,8 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); - uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); const 
vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cabac_data_t temp_cabac; @@ -1858,7 +1858,7 @@ void uvg_search_cu_intra( int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); if (depth > 0) { - uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); } // The maximum number of possible MIP modes depend on block size & shape @@ -1926,7 +1926,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b25c626c..57fee201 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -59,7 +59,8 @@ static void uvg_angular_pred_avx2( const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode) { // ISP_TODO: non-square block implementation, height is passed but not used const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; @@ -72,6 +73,7 @@ static void uvg_angular_pred_avx2( // TODO: implement handling of MRL uint8_t multi_ref_index = channel_type == COLOR_Y ? 
multi_ref_idx : 0; + uint8_t isp = isp_mode; __m256i p_shuf_01 = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -292,6 +294,13 @@ static void uvg_angular_pred_avx2( f[yy][2] = 16 + offset; f[yy][3] = offset; } + // Cubic must be used if ref line != 0 or if isp mode != 0 + if (multi_ref_index || isp) { + use_cubic = true; + } + const int16_t filter_coeff[4] = { 16 - (delta_fract[yy] >> 1), 32 - (delta_fract[yy] >> 1), 16 + (delta_fract[yy] >> 1), delta_fract[yy] >> 1 }; + const int16_t *temp_f = use_cubic ? cubic_filter[delta_fract[yy]] : filter_coeff; + memcpy(f[yy], temp_f, 4 * sizeof(*temp_f)); } // Do 4-tap intra interpolation filtering diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index faf476e1..d3d3bda4 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -58,7 +58,8 @@ static void uvg_angular_pred_generic( const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode) { const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -119,6 +120,7 @@ static void uvg_angular_pred_generic( uint32_t pred_mode = intra_mode; // ToDo: handle WAIP uint8_t multi_ref_index = multi_ref_idx; + uint8_t isp = isp_mode; // Whether to swap references to always project on the left reference row. 
const bool vertical_mode = intra_mode >= 34; @@ -246,8 +248,8 @@ static void uvg_angular_pred_generic( use_cubic = false; } } - // Cubic must be used if ref line != 0 - if (multi_ref_index) { + // Cubic must be used if ref line != 0 or if isp mode is != 0 + if (multi_ref_index || isp) { use_cubic = true; } const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 9708a3d8..ce008d01 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -51,7 +51,8 @@ typedef void (angular_pred_func)( const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx); + const uint8_t multi_ref_idx, + const uint8_t isp_mode); typedef void (intra_pred_planar_func)( const cu_loc_t* const cu_loc, From a28e61eff72ad878c4fb81f4f1fd82930972dc87 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Sep 2022 14:59:08 +0300 Subject: [PATCH 066/254] [isp] Fix CI errors. 
--- src/encode_coding_tree.c | 7 ++++--- src/encode_coding_tree.h | 2 +- src/intra.c | 17 ----------------- src/intra.h | 3 ++- src/strategies/avx2/encode_coding_tree-avx2.h | 2 +- .../generic/encode_coding_tree-generic.c | 2 +- .../generic/encode_coding_tree-generic.h | 2 +- src/strategies/strategies-encode.h | 2 +- 8 files changed, 11 insertions(+), 26 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 96feb36c..0a867bb6 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -465,7 +465,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, - cu_loc_t *cu_loc, + const cu_loc_t *cu_loc, int depth, cu_info_t* cur_pu, int8_t* scan_idx, @@ -1880,12 +1880,13 @@ void uvg_encode_mvd(encoder_state_t * const state, * \param lcu_width LCU_WIDTH for luma, LCU_WIDTH_C for chroma. * */ -void uvg_get_sub_coeff(coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) +void uvg_get_sub_coeff(const coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) { // Take subset of coeff array + coeff_t* dst_ptr = dst; const coeff_t* coeff_ptr = &src[lcu_x + lcu_y * lcu_width]; for (int j = 0; j < block_h; ++j) { //memcpy(dst_coeff + (j * lcu_width), &coeff[j * tr_width], tr_width * sizeof(coeff_t)); - memcpy(&dst[j * block_w], &coeff_ptr[j * lcu_width], block_w * sizeof(coeff_t)); + memcpy(&dst_ptr[j * block_w], &coeff_ptr[j * lcu_width], block_w * sizeof(coeff_t)); } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 5b4ce324..575f4afd 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -117,7 +117,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t width, uint8_t height, uint8_t type, uint8_t scan, double* bits_out); -void 
uvg_get_sub_coeff(coeff_t* dst, const coeff_t* const src, +void uvg_get_sub_coeff(const coeff_t* dst, const coeff_t* const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width); diff --git a/src/intra.c b/src/intra.c index 61d479d6..51a53f15 100644 --- a/src/intra.c +++ b/src/intra.c @@ -978,23 +978,6 @@ static void intra_predict_regular( } -int get_isp_ref_pixels_num(const int lcu_x, const int lcu_y, const int width, const int height, const int isp_mode) -{ - // TODO: this only works until non-square blocks are implemented - const int block_size = MAX(width, height); - const int split_size = MIN(width, height); - if (isp_mode == ISP_MODE_HOR) { - int ref_pix_left = LCU_WIDTH - lcu_y; - } - else if (isp_mode == ISP_MODE_VER) { - - } - else { - assert(false && "This should never trigger."); - } -} - - void uvg_intra_build_reference_any( const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, diff --git a/src/intra.h b/src/intra.h index c59ec497..f324c4fa 100644 --- a/src/intra.h +++ b/src/intra.h @@ -117,7 +117,8 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_refs, - uint8_t multi_ref_idx); + uint8_t multi_ref_idx, + const uint8_t isp_mode); /** * \brief Generate intra predictions. 
diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h index fa4ec8d5..ea7f077e 100644 --- a/src/strategies/avx2/encode_coding_tree-avx2.h +++ b/src/strategies/avx2/encode_coding_tree-avx2.h @@ -45,7 +45,7 @@ void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - cu_loc_t *loc, + const cu_loc_t *loc, uint8_t type, int8_t scan_mode, int8_t tr_skip, diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index d0c87f4a..8d9ca61d 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -54,7 +54,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - cu_loc_t *cu_loc, + const cu_loc_t *cu_loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index 09255deb..26682a61 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -44,7 +44,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - cu_loc_t *loc, + const cu_loc_t *loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index 2bffacca..625f4005 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -49,7 +49,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - cu_loc_t *loc, + const cu_loc_t *loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, From 7ba557af6b6cd9839f82855e35f35fe3279c9e05 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 13 Sep 2022 13:27:16 
+0300 Subject: [PATCH 067/254] [isp] Fix luma cbf writing for ISP splits. Do not write luma cbf if first three splits had luma cbf 0. --- src/encode_coding_tree.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 0a867bb6..096e4f5c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -622,6 +622,7 @@ static void encode_transform_coeff( lcu_coeff_t* coeff, enum uvg_tree_type tree_type, bool last_split, + bool can_skip_last_cbf, int *luma_cbf_ctx, // Always true except when writing sub partition coeffs (ISP) cu_loc_t *original_loc) // Original dimensions before ISP split { @@ -631,6 +632,8 @@ static void encode_transform_coeff( const int width = cu_loc->width; const int height = cu_loc->height; + bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; + //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; @@ -710,7 +713,7 @@ static void encode_transform_coeff( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); + encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, false, luma_cbf_ctx, &loc); } } return; @@ -722,11 +725,15 @@ static void encode_transform_coeff( // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. 
if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { + if (can_skip_last_cbf && isp_split && last_split) { + // Do not write luma cbf if first three isp splits have luma cbf 0 + } else { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); if (tr_depth == 0) { *luma_cbf_ctx = 2 + cb_flag_y; } + } } if (cb_flag_y | cb_flag_u | cb_flag_v) { @@ -1621,7 +1628,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (cbf) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, &cu_loc); } encode_mts_idx(state, cabac, cur_cu); @@ -1645,13 +1652,16 @@ void uvg_encode_coding_tree( int split_type = cur_cu->intra.isp_mode; int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); luma_cbf_ctx = split_limit != 1 ? 2 : 0; + // If all first three splits have luma cbf 0, the last one must be one. 
 Since the value can be derived, no need to write it + bool can_skip_last_cbf = true; for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type); // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, &cu_loc); + can_skip_last_cbf &= luma_cbf_ctx == 2; } } @@ -1670,7 +1680,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, &cu_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } From 910501012f8182f54b6023e7ef0d0cc48c52e240 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 13 Sep 2022 16:02:17 +0300 Subject: [PATCH 068/254] [isp] Fix reference building for depth 2 blocks. Flip horizontal mode dimensions during prediction. Fix reference length during prediction when ISP enabled. 
--- src/intra.c | 30 ++---------- src/strategies/generic/intra-generic.c | 67 ++++++++++++++++---------- 2 files changed, 46 insertions(+), 51 deletions(-) diff --git a/src/intra.c b/src/intra.c index 51a53f15..3c02dac2 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1075,13 +1075,7 @@ void uvg_intra_build_reference_any( px_available_left = height; } else { - // Left LCU edge has more pixels available - if (lcu_px.x > 0) { - px_available_left = cu_height - (pu_y - cu_y); - } - else { - px_available_left = LCU_WIDTH - lcu_px.y; - } + px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; } } else { @@ -1198,12 +1192,7 @@ void uvg_intra_build_reference_any( px_available_top = width; } else { - if (lcu_px.y > 0) { - px_available_top = LCU_WIDTH - lcu_px.x; - } - else { - px_available_top = LCU_WIDTH; - } + px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4]; } } else { @@ -1372,13 +1361,7 @@ void uvg_intra_build_reference_inner( px_available_left = height; } else { - // Left LCU edge has more pixels available - if (lcu_px.x > 0) { - px_available_left = cu_height - (pu_y - cu_y); - } - else { - px_available_left = LCU_WIDTH - lcu_px.y; - } + px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; } } @@ -1426,12 +1409,7 @@ void uvg_intra_build_reference_inner( px_available_top = width; } else { - if (lcu_px.y > 0) { - px_available_top = LCU_WIDTH - lcu_px.x; - } - else { - px_available_top = LCU_WIDTH; - } + px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4]; } } else { diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index d3d3bda4..1891ee5a 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -138,6 +138,10 @@ static void uvg_angular_pred_generic( // Pointer for the other reference. const uvg_pixel *ref_side; + const int cu_dim = MAX(width, height); + const int top_ref_length = isp_mode ? 
width + cu_dim : width << 1; + const int left_ref_lenght = isp_mode ? height + cu_dim : height << 1; + // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { @@ -192,8 +196,8 @@ static void uvg_angular_pred_generic( temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); }*/ - memcpy(&temp_above[0], &in_ref_above[0], ((width << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel)); - memcpy(&temp_left[0], &in_ref_left[0], ((height << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[0], &in_ref_left[0], (left_ref_lenght + 1 + multi_ref_index) * sizeof(uvg_pixel)); ref_main = vertical_mode ? temp_above : temp_left; ref_side = vertical_mode ? temp_left : temp_above; @@ -202,12 +206,17 @@ static void uvg_angular_pred_generic( const int log2_ratio = log2_width - log2_height; const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); const int max_index = (multi_ref_index << s) + 2; - const int ref_length = vertical_mode ? width << 1 : height << 1; + int ref_length; + if (isp_mode) { + ref_length = vertical_mode ? top_ref_length : left_ref_lenght; + } + else { + ref_length = vertical_mode ? width << 1 : height << 1; + } const uvg_pixel val = ref_main[ref_length + multi_ref_index]; for (int j = 1; j <= max_index; j++) { ref_main[ref_length + multi_ref_index + j] = val; } - //// sample_disp >= 0 means we don't need to refer to negative indices, //// which means we can just use the references as is. @@ -221,6 +230,14 @@ static void uvg_angular_pred_generic( //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1]; } + // Flip dimensions for horizontal modes + int tmp_width = vertical_mode ? width : height; + int tmp_height = vertical_mode ? 
height : width; + + uvg_pixel tmp_dst[LCU_WIDTH * LCU_WIDTH]; + uvg_pixel* dst_buf = vertical_mode ? dst : tmp_dst; + + // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; @@ -228,7 +245,7 @@ static void uvg_angular_pred_generic( if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. - for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < tmp_height; ++y, delta_pos += sample_disp) { int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); @@ -255,36 +272,36 @@ static void uvg_angular_pred_generic( const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; int16_t const * const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; // Do 4-tap intra interpolation filtering - for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { + for (int_fast32_t x = 0; x < tmp_width; x++, ref_main_index++) { p[0] = ref_main[ref_main_index]; p[1] = ref_main[ref_main_index + 1]; p[2] = ref_main[ref_main_index + 2]; p[3] = ref_main[ref_main_index + 3]; - dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); + dst_buf[y * tmp_width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); } } else { // Do linear filtering - for (int_fast32_t x = 0; x < width; ++x) { + for (int_fast32_t x = 0; x < tmp_width; ++x) { uvg_pixel ref1 = ref_main[x + delta_int + 1]; uvg_pixel ref2 = ref_main[x + delta_int + 2]; - dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); + dst_buf[y * tmp_width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); } } } else { 
// Just copy the integer samples - for (int_fast32_t x = 0; x < width; x++) { - dst[y * width + x] = ref_main[x + delta_int + 1]; + for (int_fast32_t x = 0; x < tmp_width; x++) { + dst_buf[y * tmp_width + x] = ref_main[x + delta_int + 1]; } } // PDPC - bool PDPC_filter = (width >= 4 || channel_type != 0); + bool PDPC_filter = (tmp_width >= 4 || channel_type != 0); if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; @@ -295,12 +312,12 @@ static void uvg_angular_pred_generic( } if(PDPC_filter) { int inv_angle_sum = 256; - for (int x = 0; x < MIN(3 << scale, width); x++) { + for (int x = 0; x < MIN(3 << scale, tmp_width); x++) { inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; int wL = 32 >> (2 * x >> scale); const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; - dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6); + dst_buf[y * tmp_width + x] = dst_buf[y * tmp_width + x] + ((wL * (left - dst_buf[y * tmp_width + x]) + 32) >> 6); } } @@ -342,32 +359,32 @@ static void uvg_angular_pred_generic( // Do not apply PDPC if multi ref line index is other than 0 // TODO: do not do PDPC if block is in BDPCM mode - bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); + bool do_pdpc = (((tmp_width >= 4 && tmp_height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { int scale = (log2_width + log2_height - 2) >> 2; const uvg_pixel top_left = ref_main[0]; - for (int_fast32_t y = 0; y < height; ++y) { - memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + for (int_fast32_t y = 0; y < tmp_height; ++y) { + memcpy(&dst_buf[y * tmp_width], &ref_main[1], tmp_width * sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; - for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) { + for (int_fast32_t x = 0; x < MIN(3 
<< scale, tmp_width); ++x) { const int wL = 32 >> (2 * x >> scale); - const uvg_pixel val = dst[y * width + x]; - dst[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + const uvg_pixel val = dst_buf[y * tmp_width + x]; + dst_buf[y * tmp_width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); } } } else { - for (int_fast32_t y = 0; y < height; ++y) { - memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + for (int_fast32_t y = 0; y < tmp_height; ++y) { + memcpy(&dst_buf[y * tmp_width], &ref_main[1], tmp_width * sizeof(uvg_pixel)); } } } // Flip the block if this is was a horizontal mode. if (!vertical_mode) { - for (int_fast32_t y = 0; y < height - 1; ++y) { - for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * width + x], dst[x * height + y], uvg_pixel); + for (int_fast32_t y = 0; y < tmp_height; ++y) { + for (int_fast32_t x = 0; x < tmp_width; ++x) { + dst[x * width + y] = tmp_dst[y * tmp_width + x]; } } } From d39fddf0d89fb63a54905b24e2effea1e95a1a10 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 14 Sep 2022 16:54:53 +0300 Subject: [PATCH 069/254] [isp] Implement DCT for small blocks. 
--- src/strategies/generic/dct-generic.c | 108 ++++++++++++++++++++----- src/strategies/generic/intra-generic.c | 3 +- src/transform.c | 4 +- 3 files changed, 95 insertions(+), 20 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 03453b90..507ed174 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -771,6 +771,12 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + a, a, \ + a, -a \ +} + #define DEFINE_DCT2_P4_MATRIX(a,b,c) \ { \ a, a, a, a, \ @@ -1002,6 +1008,7 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, } // DCT-2 +const int16_t uvg_g_DCT2P2[4] = DEFINE_DCT2_P2_MATRIX(64); const int16_t uvg_g_DCT2P4[16] = DEFINE_DCT2_P4_MATRIX(64, 83, 36); const int16_t uvg_g_DCT2P8[64] = DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18); const int16_t uvg_g_DCT2P16[256] = DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9); @@ -1020,6 +1027,68 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77 const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4); // ********************************** DCT-2 ********************************** +void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + + const int16_t* iT = uvg_g_DCT2P2; + + int16_t *p_coef = dst; + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* E and O */ + E = src[0] + src[1]; + O = src[0] - src[1]; + + dst[0] = (iT[0] * E + add) >> shift; + dst[line] = (iT[2] * O + add) >> shift; + + + src += 2; + dst++; + } + if (skip_line) + { + dst = p_coef + reduced_line; + for (j = 0; j < 2; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_line); + dst += line; + } + } +} + +void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = 1 << (shift - 1); + + const int16_t* iT = uvg_g_DCT2P2; + + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ + E = iT[0] * (src[0] + src[line]); + O = iT[2] * (src[0] - src[line]); + + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ + dst[0] = (short)CLIP(-32768, 32767, (E + add) >> shift); + dst[1] = (short)CLIP(-32768, 32767, (O + add) >> shift); + + src++; + dst += 2; + } + if (skip_line) + { + memset(dst, 0, (skip_line << 1) * sizeof(int16_t)); + } +} + static void fastForwardDCT2_B4(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) { int32_t j; @@ -2417,16 +2486,16 @@ DCT_MTS_NXN_GENERIC(DST1, 32); typedef void partial_tr_func(const int16_t*, int16_t*, int32_t, int, int, int); // ToDo: Enable MTS 2x2 and 64x64 transforms -static partial_tr_func* dct_table[3][5] = { - { fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, - { fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, - { fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, +static partial_tr_func* dct_table[3][6] = { 
+ { fastForwardDCT2_B2, fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, + { NULL, fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, + { NULL, fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, }; -static partial_tr_func* idct_table[3][5] = { - { fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, - { fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, - { fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, +static partial_tr_func* idct_table[3][6] = { + { fastInverseDCT2_B2, fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, + { NULL, fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, + { NULL, fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, }; @@ -2458,6 +2527,7 @@ void uvg_get_tr_type( if (implicit_mts) { + // ISP_TODO: do these apply for ISP blocks? bool width_ok = width >= 4 && width <= 16; bool height_ok = height >= 4 && height <= 16; @@ -2506,8 +2576,10 @@ static void mts_dct_generic( { int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0); int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + //const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + //const int log2_height_minus2 = uvg_g_convert_to_bit[height]; if(tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) @@ -2522,12 +2594,12 @@ static void mts_dct_generic( } } - partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus2]; + partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus1]; + partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus1]; int16_t tmp[32 * 32]; - const int32_t shift_1st = log2_width_minus2 + bitdepth - 7; - const int32_t shift_2nd = log2_height_minus2 + 8; + const int32_t shift_1st = log2_width_minus1 + bitdepth - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; dct_hor(input, tmp, shift_1st, height, 0, skip_width); dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); @@ -2559,8 +2631,8 @@ static void mts_idct_generic( { int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? 
height - 32 : 0; - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; if (tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { @@ -2573,8 +2645,8 @@ static void mts_idct_generic( } } - partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2]; - partial_tr_func* idct_ver = idct_table[type_ver][log2_height_minus2]; + partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus1]; + partial_tr_func* idct_ver = idct_table[type_ver][log2_height_minus1]; int16_t tmp[32 * 32]; const int max_log2_tr_dynamic_range = 15; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 1891ee5a..833c42c2 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -414,7 +414,8 @@ static void uvg_intra_pred_planar_generic( const int offset = 1 << (log2_width + log2_height); const int final_shift = 1 + log2_width + log2_height; - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + // If ISP is enabled log_dim 1 is possible (limit was previously 2) + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); const uvg_pixel top_right = ref_top[width + 1]; const uvg_pixel bottom_left = ref_left[height + 1]; diff --git a/src/transform.c b/src/transform.c index 8b903579..526e71c2 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1353,7 +1353,9 @@ void uvg_quantize_lcu_residual( // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. 
- assert(width == 4 || + // Width 2 is possible with ISP blocks + assert(width == 2 || + width == 4 || width == 8 || width == 16 || width == 32 || From 99495c331be218a9a0cdc6d31606223df80c19f5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 16 Sep 2022 10:37:51 +0300 Subject: [PATCH 070/254] [isp] Fix some asserts to allow log2_dim 1 block sizes. Fix coefficient group scan order for small dimensions. --- src/intra.c | 3 ++- src/strategies/generic/intra-generic.c | 3 ++- src/tables.c | 8 +++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/intra.c b/src/intra.c index 3c02dac2..b1ca6361 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1248,7 +1248,8 @@ void uvg_intra_build_reference_inner( const int cu_x = cu_loc->x; const int cu_y = cu_loc->y; - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + // Log2_dim 1 is possible with ISP blocks + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); refs->filtered_initialized = false; uvg_pixel * __restrict out_left_ref = &refs->ref.left[0]; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 833c42c2..81de7c4b 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -66,7 +66,8 @@ static void uvg_angular_pred_generic( const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + // Log2_dim 1 is possible with ISP blocks + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; diff --git a/src/tables.c b/src/tables.c index dec8b467..dec6f020 100644 --- 
a/src/tables.c +++ b/src/tables.c @@ -2615,6 +2615,12 @@ const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, in return g_scan_order[scan_group][log2_w][log2_h]; } else { - return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; + if (log2_w == 1 || log2_h == 1) { + // Just return array containing [0, 15] in order + return g_scan_order[scan_group][0][4]; + } + else { + return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; + } } } From b8e36bbc4a8571bf66d190c2a63a04506e4a7e1f Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 16 Sep 2022 11:10:26 +0300 Subject: [PATCH 071/254] [isp] Fix storing cbfs for small ISP splits. Fix pdpc filtering. Cannot be used if width or height is less than 4. Fix dct related CI errors. --- src/cu.c | 36 ++++++++++++++++++++++++++ src/cu.h | 1 + src/encode_coding_tree.c | 15 ++++++++--- src/intra.c | 1 + src/search.c | 8 ++++-- src/strategies/avx2/intra-avx2.c | 2 +- src/strategies/generic/dct-generic.c | 4 +-- src/strategies/generic/intra-generic.c | 2 +- 8 files changed, 59 insertions(+), 10 deletions(-) diff --git a/src/cu.c b/src/cu.c index 10d99943..3a0f03fa 100644 --- a/src/cu.c +++ b/src/cu.c @@ -97,6 +97,42 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px) } +void uvg_get_isp_cu_arr_coords(int *x, int *y) +{ + // Do nothing if dimensions are divisible by 4 + if (*y % 4 == 0 && *x % 4 == 0) return; + const int remainder_y = *y % 4; + const int remainder_x = *x % 4; + + if (remainder_y != 0) { + // Horizontal ISP split + if (remainder_y % 2 == 0) { + // 8x2 block + *y -= 2; + *x += 4; + } + else { + // 16x1 block + *y -= remainder_y; + *x += remainder_y * 4; + } + } + else { + // Vertical ISP split + if (*x % 2 == 0) { + // 2x8 block + *y += 4; + *x -= 2; + } + else { + // 1x16 block + *y += remainder_x * 4; + *x -= remainder_x; + } + } +} + + const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px) { assert(x_px < cua->width); diff --git 
a/src/cu.h b/src/cu.h index dae446c4..de22dd89 100644 --- a/src/cu.h +++ b/src/cu.h @@ -249,6 +249,7 @@ typedef struct cu_array_t { } cu_array_t; cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px); +void uvg_get_isp_cu_arr_coords(int* x, int* y); const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px); cu_array_t * uvg_cu_array_alloc(const int width, const int height); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 096e4f5c..cd11ddc3 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -542,7 +542,10 @@ static void encode_transform_unit( const uint8_t height_c = cu_loc->chroma_height; cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y); + int isp_x = x; + int isp_y = y; + uvg_get_isp_cu_arr_coords(&isp_x, &isp_y); + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); int8_t scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); @@ -627,13 +630,17 @@ static void encode_transform_coeff( cu_loc_t *original_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; - const int x = cu_loc->x; - const int y = cu_loc->y; + int x = cu_loc->x; + int y = cu_loc->y; const int width = cu_loc->width; const int height = cu_loc->height; bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; + if (isp_split) { + uvg_get_isp_cu_arr_coords(&x, &y); + } + //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; @@ -643,7 +650,7 @@ static void encode_transform_coeff( // containing CU. 
const int x_cu = 8 * (x / 8); const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); + const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); // TODO: very suspect, chroma cbfs stored in upper left corner, everything else in bottom right for depth 4 // NxN signifies implicit transform split at the first transform level. // There is a similar implicit split for inter, but it is only used when diff --git a/src/intra.c b/src/intra.c index b1ca6361..2239eeac 100644 --- a/src/intra.c +++ b/src/intra.c @@ -971,6 +971,7 @@ static void intra_predict_regular( // pdpc // bool pdpcCondition = (mode == 0 || mode == 1 || mode == 18 || mode == 50); bool pdpcCondition = (mode == 0 || mode == 1); // Planar and DC + pdpcCondition &= width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH; if (pdpcCondition && multi_ref_index == 0) // Cannot be used with MRL. { uvg_pdpc_planar_dc(mode, cu_loc, color, used_ref, dst); diff --git a/src/search.c b/src/search.c index ff426dda..5c2f3942 100644 --- a/src/search.c +++ b/src/search.c @@ -1172,8 +1172,12 @@ static double search_cu( for (int i = 0; i < split_num; ++i) { cu_loc_t isp_loc; uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); - //search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << (i++); - cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, isp_loc.x % LCU_WIDTH, isp_loc.y % LCU_WIDTH); + // Fetching from CU array does not work for dimensions less than 4 + // Fetch proper x, y coords for isp blocks + int tmp_x = isp_loc.x; + int tmp_y = isp_loc.y; + uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; cbf_clear(&split_cu->cbf, depth, COLOR_Y); cbf_clear(&split_cu->cbf, depth, COLOR_U); diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 57fee201..6e0f10c3 100644 --- 
a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -357,7 +357,7 @@ static void uvg_angular_pred_avx2( // PDPC - bool PDPC_filter = (width >= 4 || channel_type != 0); + bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 507ed174..db725359 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -1027,7 +1027,7 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77 const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4); // ********************************** DCT-2 ********************************** -void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +static void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) { int32_t j; int32_t E, O; @@ -1061,7 +1061,7 @@ void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int lin } } -void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2) +static void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2) { int32_t j; int32_t E, O; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 81de7c4b..eff47941 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -302,7 +302,7 @@ static void uvg_angular_pred_generic( // PDPC - bool PDPC_filter = (tmp_width >= 4 || channel_type != 0); + bool PDPC_filter = ((tmp_width >= 
TR_MIN_WIDTH && tmp_height >= TR_MIN_WIDTH) || channel_type != 0); if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; From 01c4d1ddb0b050ebb6dc5fe987bf5017c78c4830 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 20 Sep 2022 12:35:16 +0300 Subject: [PATCH 072/254] [isp] Fix cabac issues. There are always four transform blocks even if there are only two ISP splits. Fix prediction issues. PDPC filter was applied when it should be disabled. Fix reference building issues. Left reference was built incorrectly for blocks with height 2. --- src/encode_coding_tree.c | 4 +- src/intra.c | 74 ++++++++++++++++++-------- src/intra.h | 6 +-- src/search.c | 19 ++++--- src/strategies/avx2/intra-avx2.c | 2 +- src/strategies/generic/intra-generic.c | 2 +- src/transform.c | 4 +- 7 files changed, 74 insertions(+), 37 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index cd11ddc3..79756712 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1657,13 +1657,13 @@ void uvg_encode_coding_tree( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. int split_type = cur_cu->intra.isp_mode; - int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type); + int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); luma_cbf_ctx = split_limit != 1 ? 2 : 0; // If all first three splits have luma cbf 0, the last one must be one. 
Since the value can be derived, no need to write it bool can_skip_last_cbf = true; for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type); + uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type, true); // Check if last split to write chroma bool last_split = (i + 1) == split_limit; diff --git a/src/intra.c b/src/intra.c index 2239eeac..e6ccb77a 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1077,6 +1077,10 @@ void uvg_intra_build_reference_any( } else { px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; + // This table does not have values for dimensions less than 4 + if (lcu_px.y % 4 != 0) { + px_available_left -= 2; + } } } else { @@ -1085,7 +1089,6 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. - // TODO: height for non-square blocks px_available_left = MIN(px_available_left, cu_height * 2 + multi_ref_index); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); @@ -1364,6 +1367,10 @@ void uvg_intra_build_reference_inner( } else { px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; + // This table does not have values for dimensions less than 4 + if (lcu_px.y % 4 != 0) { + px_available_left -= 2; + } } } @@ -1378,13 +1385,24 @@ void uvg_intra_build_reference_inner( // Copy pixels from coded CUs. 
int i = multi_ref_index; // Offset by multi_ref_index - do { - out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; - out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; - out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; - out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; - i += 4; - } while (i < px_available_left); + + // Do different loop for heights smaller than 4 (possible for some ISP splits) + if (lcu_px.y % 4 != 0) { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; + i += 2; + } while (i < px_available_left); + } + else { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; + out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; + out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; + i += 4; + } while (i < px_available_left); + } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = out_left_ref[i]; @@ -1556,7 +1574,7 @@ const cu_info_t* uvg_get_co_located_luma_cu( * \param height Block height. * \param split_type Horizontal or vertical split. */ -int uvg_get_isp_split_dim(const int width, const int height, const int split_type) +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_split) { assert(split_type != ISP_MODE_NO_ISP && "Cannot calculate split dimension if no split type is set. Make sure this function is not called in this case."); @@ -1576,35 +1594,45 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ const int factor_to_min_samples = non_split_dim_size < min_num_samples ? 
min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); + // Minimum width for ISP splits are 4. (JVET-T2001 chapter 8.4.5.1 equation 246: nPbW = Max(4, nW)) + // Except this does not apply for transform blocks for some reason. VTM does seem to expect 4 transform blocks even if only two pred blocks were used + // Height can be 2. + if (!divide_in_rows && !is_transform_split) { + partition_size = MAX(4, partition_size); + } + assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) >= uvg_math_floor_log2(min_num_samples)) && "Partition has less than allowed minimum number of samples."); return partition_size; } -int uvg_get_isp_split_num(const int width, const int height, const int split_type) +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_split) { assert((split_type != ISP_MODE_NO_ISP) && "This function cannot be called if ISP mode is 0."); - int split_dim = uvg_get_isp_split_dim(width, height, split_type); + int split_dim = uvg_get_isp_split_dim(width, height, split_type, is_transform_split); int num = split_type == ISP_MODE_HOR ? 
height / split_dim : width / split_dim; return num; } -void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type) +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_split) { assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); assert((split_type != ISP_MODE_NO_ISP || split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); int part_dim = block_w; if (split_type != ISP_MODE_NO_ISP) { - part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type); + part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type, is_transform_split); + } + if(split_type == ISP_MODE_VER && block_w < 16 && !is_transform_split) { + split_idx /= 2; } const int offset = part_dim * split_idx; const int part_x = split_type == ISP_MODE_HOR ? x : x + offset; const int part_y = split_type == ISP_MODE_HOR ? y + offset : y; - const int part_w = split_type == ISP_MODE_HOR ? block_w : part_dim; + const int part_w = split_type == ISP_MODE_HOR ? block_w : part_dim; const int part_h = split_type == ISP_MODE_HOR ? part_dim : block_h; uvg_cu_loc_ctor(loc, part_x, part_y, part_w, part_h); @@ -1773,17 +1801,21 @@ void uvg_intra_recon_cu( // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. 
int split_type = search_data->pred_cu.intra.isp_mode; - int split_limit = uvg_get_isp_split_num(width, height, split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); cu_loc_t origin_cu; uvg_cu_loc_ctor(&origin_cu, x, y, width, height); for (int i = 0; i < split_limit; ++i) { - cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x, y, width, height, i, split_type); + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, x, y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, x, y, width, height, i, split_type, false); - intra_recon_tb_leaf(state, &split_loc, &origin_cu, lcu, COLOR_Y, search_data, tree_type); + if(tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, &origin_cu, lcu, COLOR_Y, search_data, tree_type); + } uvg_quantize_lcu_residual(state, true, false, false, - &split_loc, depth, cur_cu, lcu, + &tu_loc, depth, cur_cu, lcu, false, tree_type); search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << i; } @@ -1855,8 +1887,8 @@ bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp return false; } - const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER); - const int tu_height = (isp_split_type == ISP_MODE_HOR) ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; + const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER, true); + const int tu_height = (isp_split_type == ISP_MODE_HOR) ? 
uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR, true) : height; if (!(tu_width >= TR_MIN_WIDTH && tu_height >= TR_MIN_WIDTH)) { diff --git a/src/intra.h b/src/intra.h index f324c4fa..ac076327 100644 --- a/src/intra.h +++ b/src/intra.h @@ -171,8 +171,8 @@ int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* l #define SPLIT_TYPE_HOR 1 #define SPLIT_TYPE_VER 2 -int uvg_get_isp_split_dim(const int width, const int height, const int split_type); -int uvg_get_isp_split_num(const int width, const int height, const int split_type); -void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, const int split_idx, const int split_type); +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block); +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block); +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block); bool uvg_can_use_isp(const int width, const int height, const int max_tr_size); bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/search.c b/src/search.c index 5c2f3942..0b51412b 100644 --- a/src/search.c +++ b/src/search.c @@ -370,7 +370,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } else { // TODO: 8x4 CUs - for (int i = 0; i < 4; i++) { + const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + for (int i = 0; i < split_limit; i++) { int luma_ctx = 2; if (i != 3 && isp_cbf != 0x8) { const int flag = (isp_cbf >> i) & 1; @@ -400,11 +401,11 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } else { int split_type = pred_cu->intra.isp_mode; - int split_limit = uvg_get_isp_split_num(width, height, 
split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type); + uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true); const int part_x = split_loc.x; const int part_y = split_loc.y; @@ -603,7 +604,8 @@ static double cu_rd_cost_tr_split_accurate( } else { // TODO: 8x4 CUs - for (int i = 0; i < 4; i++) { + const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + for (int i = 0; i < split_limit; i++) { int luma_ctx = 2; if (i != 3 && isp_cbf != 0x8) { const int flag = (isp_cbf >> i) & 1; @@ -647,11 +649,11 @@ static double cu_rd_cost_tr_split_accurate( } else { int split_type = pred_cu->intra.isp_mode; - int split_limit = uvg_get_isp_split_num(width, height, split_type); + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type); + uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true); const int part_x = split_loc.x; const int part_y = split_loc.y; @@ -1164,14 +1166,14 @@ static double search_cu( // Set isp split cbfs here const int split_type = intra_search.pred_cu.intra.isp_mode; - const int split_num = split_type == ISP_MODE_NO_ISP ? 0 : uvg_get_isp_split_num(cu_width, cu_height, split_type); + const int split_num = split_type == ISP_MODE_NO_ISP ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); const int cbf_cb = cbf_is_set(cur_cu->cbf, depth, COLOR_U); const int cbf_cr = cbf_is_set(cur_cu->cbf, depth, COLOR_V); const int jccr = cur_cu->joint_cb_cr; for (int i = 0; i < split_num; ++i) { cu_loc_t isp_loc; - uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type); + uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type, true); // Fetching from CU array does not work for dimensions less than 4 // Fetch proper x, y coords for isp blocks int tmp_x = isp_loc.x; @@ -1179,6 +1181,7 @@ static double search_cu( uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; + // ISP_TODO: here, cbfs are also set for chroma for all ISP splits, is this behavior wanted? cbf_clear(&split_cu->cbf, depth, COLOR_Y); cbf_clear(&split_cu->cbf, depth, COLOR_U); cbf_clear(&split_cu->cbf, depth, COLOR_V); diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6e0f10c3..2783454d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -363,7 +363,7 @@ static void uvg_angular_pred_avx2( PDPC_filter = false; } else if (mode_disp > 0) { - PDPC_filter = (scale >= 0); + PDPC_filter &= (scale >= 0); } } if(PDPC_filter) { diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index eff47941..98911e5f 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -308,7 +308,7 @@ static void uvg_angular_pred_generic( PDPC_filter = false; } else if (mode_disp > 0) { - PDPC_filter = (scale >= 0); + PDPC_filter &= (scale >= 0); } } if(PDPC_filter) { diff --git a/src/transform.c b/src/transform.c index 526e71c2..ee9e79ec 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1297,6 +1297,7 @@ static void quantize_tr_residual( } + // 
ISP_TODO: does this cu point to correct cbf when ISP is used for small blocks? cbf_clear(&cur_pu->cbf, depth, color); if (has_coeffs) { for (int j = 0; j < tr_height; ++j) { @@ -1353,7 +1354,7 @@ void uvg_quantize_lcu_residual( // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. - // Width 2 is possible with ISP blocks + // Width 2 is possible with ISP blocks // ISP_TODO: no, they actually are not assert(width == 2 || width == 4 || width == 8 || @@ -1363,6 +1364,7 @@ void uvg_quantize_lcu_residual( // Reset CBFs because CBFs might have been set // for depth earlier + // ISP_TODO: does this cur_cu point to the correct place when ISP is used for small blocks? if (luma) { cbf_clear(&cur_pu->cbf, depth, COLOR_Y); } From 7282534879634d4fc50cc975ed853aee5c190da7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 20 Sep 2022 16:15:23 +0300 Subject: [PATCH 073/254] [isp] Fix CI errors. --- src/search_intra.c | 2 +- src/search_intra.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index cb485414..ddc693f7 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -373,7 +373,7 @@ static double search_intra_trdepth( for (trafo = mts_start; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; pred_cu->tr_skip = trafo == MTS_SKIP; - bool constraints[2] = { false, false}; + bool constraints[2] = {false, false}; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; pred_cu->violates_mts_coeff_constraint = 0; diff --git a/src/search_intra.h b/src/search_intra.h index 307b5ad9..36470e63 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -66,6 +66,4 @@ void uvg_search_cu_intra( lcu_t *lcu, enum uvg_tree_type tree_type); -int uvg_get_isp_split_dim(const int width, const int height, const int split_type); - #endif // SEARCH_INTRA_H_ From 5713fbff1a4cea0e0e65a62428268d0d8ecc49d9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 20 Sep 2022 17:44:10 +0300 
Subject: [PATCH 074/254] [isp] Add ISP checks to search. LFNST can be used with ISP for larger blocks. Transform skip cannot be used with ISP. --- src/search_intra.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index ddc693f7..971f9e88 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -333,7 +333,9 @@ static double search_intra_trdepth( } const int mts_start = trafo; //TODO: height - if (state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { + if (state->encoder_control->cfg.trskip_enable && + width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/ && + pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { // tr_skip cannot be used with ISP num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; @@ -349,10 +351,9 @@ } int start_idx = 0; - int end_idx = state->encoder_control->cfg.lfnst && depth == pred_cu-> - tr_depth ? - max_lfnst_idx : - 0; + int end_idx = state->encoder_control->cfg.lfnst && + depth == pred_cu->tr_depth && + uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? max_lfnst_idx : 0; for (int i = start_idx; i < end_idx + 1; ++i) { search_data->lfnst_costs[i] = MAX_DOUBLE; } @@ -365,10 +366,10 @@ static double search_intra_trdepth( pred_cu->violates_lfnst_constrained_chroma = false; pred_cu->lfnst_last_scan_pos = false; - if (pred_cu->lfnst_idx != 0) { - // Cannot use ISP with LFNST for small blocks - pred_cu->intra.isp_mode = uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? pred_cu->intra.isp_mode : ISP_MODE_NO_ISP; - } + //if (pred_cu->lfnst_idx != 0) { + // // Cannot use ISP with LFNST for small blocks + // pred_cu->intra.isp_mode = uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
pred_cu->intra.isp_mode : ISP_MODE_NO_ISP; + //} for (trafo = mts_start; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; From 8ae285322ef62162405e6dd5f3a6463fe98c834f Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 20 Sep 2022 17:48:38 +0300 Subject: [PATCH 075/254] [isp] Add CI tests. --- tests/test_intra.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_intra.sh b/tests/test_intra.sh index 3c37f82b..f176db69 100755 --- a/tests/test_intra.sh +++ b/tests/test_intra.sh @@ -19,3 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra valgrind_test $common_args --rd=3 --cclm --jccr valgrind_test $common_args --lfnst valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra From c4bc2d6b105e47390d33b236154ed191634fa671 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Sep 2022 12:55:07 +0300 Subject: [PATCH 076/254] [isp] Limit ISP search to block size 32. Size 64 is not allowed. --- src/search_intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index 971f9e88..50baded9 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1392,7 +1392,7 @@ static int8_t search_intra_rdo( double best_isp_cost = MAX_DOUBLE; double best_bits = MAX_DOUBLE; int8_t best_isp_mode = -1; - int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; + int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height, TR_MAX_WIDTH) && state->encoder_control->cfg.isp ? 
NUM_ISP_MODES : 1; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { search_data[mode].pred_cu.intra.isp_mode = isp_mode; From 89db34d4e07999913b82ac4a922ebebc1bdf4e31 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Sep 2022 16:04:52 +0300 Subject: [PATCH 077/254] [isp] Use TR_MAX_WIDTH in ISP checks instead of parameter. --- src/encode_coding_tree.c | 2 +- src/intra.c | 4 ++-- src/intra.h | 2 +- src/search_intra.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 79756712..d0dab272 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1103,7 +1103,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, bool enable_isp = state->encoder_control->cfg.isp; // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Blocks of size 2 do not exist yet (not for luma at least) - bool allow_isp = enable_isp ? uvg_can_use_isp(width, height, 64 /*MAX_TR_SIZE*/) : false; + bool allow_isp = enable_isp ? uvg_can_use_isp(width, height) : false; uint8_t isp_mode = allow_isp ? cur_cu->intra.isp_mode : 0; if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { diff --git a/src/intra.c b/src/intra.c index e6ccb77a..7eb07911 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1851,7 +1851,7 @@ void uvg_intra_recon_cu( * \param height Block height. * \param max_tr_size Maximum supported transform block size (64). */ -bool uvg_can_use_isp(const int width, const int height, const int max_tr_size) +bool uvg_can_use_isp(const int width, const int height) { assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); @@ -1861,7 +1861,7 @@ bool uvg_can_use_isp(const int width, const int height, const int max_tr_size) // Each split block must have at least 16 samples. 
bool not_enough_samples = (log2_width + log2_height <= 4); - bool cu_size_larger_than_max_tr_size = width > max_tr_size || height > max_tr_size; + bool cu_size_larger_than_max_tr_size = width > TR_MAX_WIDTH || height > TR_MAX_WIDTH; if (not_enough_samples || cu_size_larger_than_max_tr_size) { return false; } diff --git a/src/intra.h b/src/intra.h index ac076327..c4bdc87e 100644 --- a/src/intra.h +++ b/src/intra.h @@ -174,5 +174,5 @@ int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* l int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block); int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block); void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block); -bool uvg_can_use_isp(const int width, const int height, const int max_tr_size); +bool uvg_can_use_isp(const int width, const int height); bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/search_intra.c b/src/search_intra.c index 50baded9..224482dc 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1392,7 +1392,7 @@ static int8_t search_intra_rdo( double best_isp_cost = MAX_DOUBLE; double best_bits = MAX_DOUBLE; int8_t best_isp_mode = -1; - int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height, TR_MAX_WIDTH) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; + int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height) && state->encoder_control->cfg.isp ? 
NUM_ISP_MODES : 1; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { search_data[mode].pred_cu.intra.isp_mode = isp_mode; From eb6771321f5326f27e3990fdbb9a30501a27983b Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Sep 2022 15:07:16 +0300 Subject: [PATCH 078/254] [isp] Disable fast residual cost calculation from ISP related CI tests. --- tests/test_intra.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_intra.sh b/tests/test_intra.sh index f176db69..ea58b415 100755 --- a/tests/test_intra.sh +++ b/tests/test_intra.sh @@ -19,5 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra valgrind_test $common_args --rd=3 --cclm --jccr valgrind_test $common_args --lfnst valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0 -valgrind_test $common_args --rd=2 --isp --cpuid=0 -valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra +valgrind_test $common_args --rd=2 --isp --cpuid=0 --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra --fast-residual-cost 0 From 701257cdd22e2456c04211d932a543e7458bab0c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Sep 2022 16:49:09 +0300 Subject: [PATCH 079/254] [isp] Remove unnecessary code from forward dct 32. 
--- src/strategies/generic/dct-generic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index db725359..1e7c796c 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -1435,11 +1435,6 @@ static void fastForwardDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, dst += line; } } - if (skip_line2) { - const int reduced_line = line - skip_line2; - dst = p_coef + reduced_line * 32; - memset(dst, 0, skip_line2 * 32 * sizeof(coeff_t)); - } } static void fastInverseDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) From b9822398a005072546718c695d9c15ce144e9506 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 23 Sep 2022 15:41:50 +0300 Subject: [PATCH 080/254] [isp] Fix lfnst constraint checks when ISP is in use. Add some asserts. --- src/encode_coding_tree.c | 27 ++++++++++++++++++++++++++- src/intra.c | 3 +++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index d0dab272..c2be1395 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -133,10 +133,35 @@ bool uvg_is_lfnst_allowed( } bool luma_flag = (depth == 4 && color == COLOR_Y) || (tree_type != UVG_CHROMA_T && depth != 4); bool chroma_flag = (depth == 4 && color != COLOR_Y) || tree_type != UVG_LUMA_T; - bool non_zero_coeff_non_ts_corner_8x8 = (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + bool non_zero_coeff_non_ts_corner_8x8 = false; + bool last_scan_pos = false; bool is_tr_skip = false; + int split_num = color == COLOR_Y && isp_mode ? 
uvg_get_isp_split_num(width, height, isp_mode, false) : 0; const videoframe_t* const frame = state->tile->frame; + + if (split_num) { + // Constraints for ISP split blocks + for (int i = 0; i < split_num; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x, y, width, height, i, isp_mode, false); + int local_split_x = split_loc.x; + int local_split_y = split_loc.y; + uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); + cu_info_t* split_cu = lcu ? LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : + uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); + + if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); + //last_scan_pos |= split_cu->lfnst_last_scan_pos; + last_scan_pos |= true; + } + } + } else { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + last_scan_pos |= pred_cu->lfnst_last_scan_pos; + } + //const int num_pred_units = kvz_part_mode_num_parts[pred_cu->part_size]; const int tr_depth = pred_cu->tr_depth; assert(depth <= tr_depth && "Depth greater than transform depth. 
This should never trigger."); diff --git a/src/intra.c b/src/intra.c index 7eb07911..a831f768 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1619,6 +1619,9 @@ int uvg_get_isp_split_num(const int width, const int height, const int split_typ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_split) { + // Check for illegal splits + assert(!(block_w == 4 && block_h == 4) || split_idx == 0 && "Trying to get ISP split CU when split is not allowed."); + assert(!((block_w * block_h) <= 16) || split_idx < 2 && "Split index for small blocks must be in [0, 1]"); assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); assert((split_type != ISP_MODE_NO_ISP || split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); int part_dim = block_w; From 85f6b0039449ef62810b411e56cc1cd83be09dd8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 26 Sep 2022 14:44:51 +0300 Subject: [PATCH 081/254] [isp] Add lfnst asserts. Fix error in MTS search. Fix chroma lfnst index when no coefficients present. 
--- src/encode_coding_tree.c | 6 ++++++ src/search_intra.c | 8 +++----- src/transform.c | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index c2be1395..a739cebf 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -230,6 +230,12 @@ static bool encode_lfnst_idx( return true; } else { + if(depth != 4 || color == COLOR_Y) { + assert(pred_cu->lfnst_idx == 0); + } + if(depth == 4 && color != COLOR_Y) { + assert(pred_cu->cr_lfnst_idx == 0); + } return false; } } diff --git a/src/search_intra.c b/src/search_intra.c index 224482dc..03609552 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -343,9 +343,7 @@ static double search_intra_trdepth( const int max_tb_size = TR_MAX_WIDTH; // LFNST search params - int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? - 0 : - 2; + int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? 0 : 2; if(pred_cu->intra.mip_flag && (width < 16 || height < 16)) { max_lfnst_idx = 0; } @@ -379,8 +377,8 @@ static double search_intra_trdepth( pred_cu->mts_last_scan_pos = 0; pred_cu->violates_mts_coeff_constraint = 0; - if ((trafo == MTS_SKIP && width > (1 << state->encoder_control->cfg.trskip_max_size)) - || !state->encoder_control->cfg.trskip_enable) { + if (trafo == MTS_SKIP && (width > (1 << state->encoder_control->cfg.trskip_max_size) + || !state->encoder_control->cfg.trskip_enable)) { continue; } } diff --git a/src/transform.c b/src/transform.c index ee9e79ec..954d7836 100644 --- a/src/transform.c +++ b/src/transform.c @@ -580,6 +580,7 @@ void uvg_chroma_transform_search( &u_has_coeffs, &v_has_coeffs, pred_cu->cr_lfnst_idx); + if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { bool constraints[2] = { false, false }; From b4cc321349d7f64b226813ec514314fedc85df82 Mon Sep 17 00:00:00 2001 From: 
siivonek Date: Mon, 26 Sep 2022 17:52:34 +0300 Subject: [PATCH 082/254] [isp] Fix transform selection when MTS & ISP is used. Wrong transform was selected. Change mts parameter name to better reflect its purpose. --- src/strategies/avx2/dct-avx2.c | 10 +++++----- src/strategies/generic/dct-generic.c | 28 +++++++++++++++++++--------- src/strategies/strategies-dct.c | 4 ++-- src/strategies/strategies-dct.h | 4 ++-- 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 4197f17a..04e92a7f 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1584,7 +1584,7 @@ extern void uvg_get_tr_type( const cu_info_t* tu, tr_type_t* hor_out, tr_type_t* ver_out, - const int8_t mts_idx); + const int8_t mts_type); static void mts_dct_avx2( const int8_t bitdepth, @@ -1594,12 +1594,12 @@ static void mts_dct_avx2( const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && width == height) { @@ -1625,12 +1625,12 @@ static void mts_idct_avx2( const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); if (type_hor == DCT2 && type_ver == DCT2 && width == height) { diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 1e7c796c..72cd1fb1 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2505,7 +2505,7 @@ void uvg_get_tr_type( const cu_info_t* tu, tr_type_t* 
hor_out, tr_type_t* ver_out, - const int8_t mts_idx) + const int8_t mts_type) { *hor_out = DCT2; *ver_out = DCT2; @@ -2515,14 +2515,20 @@ void uvg_get_tr_type( return; } - const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER)); - const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER); + const bool explicit_mts = mts_type == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : (mts_type == UVG_MTS_INTER && tu->type == CU_INTER)); + const bool implicit_mts = tu->type == CU_INTRA && (mts_type == UVG_MTS_IMPLICIT || mts_type == UVG_MTS_INTER); assert(!(explicit_mts && implicit_mts)); + const bool is_isp = tu->type == CU_INTRA && tu->intra.isp_mode && color == COLOR_Y ? tu->intra.isp_mode : 0; + const int8_t lfnst_idx = color == COLOR_Y ? tu->lfnst_idx : tu->cr_lfnst_idx; + // const bool is_sbt = cu->type == CU_INTER && tu->sbt && color == COLOR_Y; // TODO: check SBT here when implemented - if (implicit_mts) + if (is_isp && lfnst_idx) { + return; + } + + if (implicit_mts || (is_isp && explicit_mts)) { - // ISP_TODO: do these apply for ISP blocks? 
bool width_ok = width >= 4 && width <= 16; bool height_ok = height >= 4 && height <= 16; @@ -2537,6 +2543,10 @@ void uvg_get_tr_type( return; } + /* + TODO: SBT HANDLING + */ + if (explicit_mts) { if (tu->tr_idx > MTS_SKIP) { @@ -2555,12 +2565,12 @@ static void mts_dct_generic( const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { @@ -2610,12 +2620,12 @@ static void mts_idct_generic( const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 8441dfdd..64b72eb9 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -60,7 +60,7 @@ void(*uvg_mts_dct)(int8_t bitdepth, int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); void(*uvg_mts_idct)(int8_t bitdepth, color_t color, @@ -69,7 +69,7 @@ void(*uvg_mts_idct)(int8_t bitdepth, int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index b883b3e5..0ad3c8c4 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -68,7 +68,7 @@ typedef void 
(mts_dct_func)( int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_dct_func* uvg_mts_dct; @@ -80,7 +80,7 @@ typedef void (mts_idct_func)( int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_idct_func* uvg_mts_idct; From 3c861e4c0282dcbdce0148d8e3d2336646551d55 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 27 Sep 2022 14:10:03 +0300 Subject: [PATCH 083/254] [isp] Fix search. Best LFNST and MTS modes were not selected correctly for ISP modes. --- src/search_intra.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/search_intra.c b/src/search_intra.c index 03609552..67424bbf 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1392,7 +1392,12 @@ static int8_t search_intra_rdo( int8_t best_isp_mode = -1; int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height) && state->encoder_control->cfg.isp ? NUM_ISP_MODES : 1; + // + int best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; + int best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { + + search_data[mode].pred_cu.intra.isp_mode = isp_mode; double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; @@ -1400,6 +1405,8 @@ static int8_t search_intra_rdo( search_data[mode].cost = rdo_bitcost * state->lambda; double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); + best_mts_mode_for_isp[isp_mode] = search_data[mode].pred_cu.tr_idx; + best_lfnst_mode_for_isp[isp_mode] = search_data[mode].pred_cu.lfnst_idx; search_data[mode].cost += mode_cost; if (search_data[mode].cost < best_isp_cost) { best_isp_cost = search_data[mode].cost; @@ -1414,6 +1421,8 @@ static int8_t search_intra_rdo( search_data[mode].cost = best_isp_cost; search_data[mode].bits = best_bits; 
search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; + search_data[mode].pred_cu.tr_idx = best_mts_mode_for_isp[best_isp_mode]; + search_data[mode].pred_cu.lfnst_idx = best_lfnst_mode_for_isp[best_isp_mode]; } // Update order according to new costs From 7005d222d5865750248183df042c4543ceb11f6e Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 28 Sep 2022 12:38:02 +0300 Subject: [PATCH 084/254] [isp] Fix lfnst constraint check when ISP is used. Remove some obsolete comments. --- src/cu.h | 4 ++-- src/encode_coding_tree.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cu.h b/src/cu.h index de22dd89..ecb7c695 100644 --- a/src/cu.h +++ b/src/cu.h @@ -172,8 +172,8 @@ typedef struct uint8_t violates_mts_coeff_constraint : 1; uint8_t mts_last_scan_pos : 1; - uint8_t violates_lfnst_constrained_luma : 1; // Two types, luma and chroma. Luma index is 0. - uint8_t violates_lfnst_constrained_chroma : 1; // Two types, luma and chroma. Luma index is 0. + uint8_t violates_lfnst_constrained_luma : 1; + uint8_t violates_lfnst_constrained_chroma : 1; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a739cebf..f5531d63 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -151,7 +151,9 @@ bool uvg_is_lfnst_allowed( cu_info_t* split_cu = lcu ? 
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); - if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + // ISP_TODO: remove this if clause altogether if it seems it is not needed + if (true) { non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); //last_scan_pos |= split_cu->lfnst_last_scan_pos; last_scan_pos |= true; From 90e2a177596a318a0e6972baf700c908047380bc Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 29 Sep 2022 15:41:33 +0300 Subject: [PATCH 085/254] [lfnst] Fix LFNST error when MIP enabled. --- src/transform.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/transform.c b/src/transform.c index 954d7836..b260eea1 100644 --- a/src/transform.c +++ b/src/transform.c @@ -812,6 +812,26 @@ static inline bool get_transpose_flag(const int8_t intra_mode) ((intra_mode < NUM_LUMA_MODE) && (intra_mode > DIA_IDX)); } + +static inline bool block_is_mip(const cu_info_t * const cur_cu, const color_t color, const bool is_sep_tree) +{ + if (cur_cu->type == CU_INTRA) { + if (color == COLOR_Y) { + return cur_cu->intra.mip_flag; + } + else { + // MIP_TODO: currently, only chroma 420 is supported. Therefore this will always return false + + //bool derived_mode = cur_cu->intra.mode_chroma == (!cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0); + //bool is_chroma_mip = !is_sep_tree /*&& chroma_format == CHROMA_444*/ && cur_cu->intra.mip_flag; + //return is_chroma_mip && derived_mode; + + return false; + } + } + return false; +} + void uvg_fwd_lfnst( const cu_info_t* const cur_cu, const int width, @@ -828,7 +848,7 @@ void uvg_fwd_lfnst( bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA ? cur_cu->intra.mip_flag : false; + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); bool is_wide_angle = false; // TODO: get wide angle mode when implemented const int cu_type = cur_cu->type; @@ -965,7 +985,7 @@ void uvg_inv_lfnst( bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA && tree_type != UVG_CHROMA_T ? cur_cu->intra.mip_flag : false; + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); bool is_wide_angle = false; // TODO: get wide angle mode when implemented const int cu_type = cur_cu->type; From 95d73116f962b1166774517f72c3c50c5372af75 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 3 Oct 2022 13:13:56 +0300 Subject: [PATCH 086/254] [isp] Fix some CI errors. Some const modifiers were discarded. --- src/encode_coding_tree.c | 4 ++-- src/rdo.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index f5531d63..46552a12 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -148,7 +148,7 @@ bool uvg_is_lfnst_allowed( int local_split_x = split_loc.x; int local_split_y = split_loc.y; uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); - cu_info_t* split_cu = lcu ? LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : + const cu_info_t* split_cu = lcu ? 
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { @@ -1933,7 +1933,7 @@ void uvg_encode_mvd(encoder_state_t * const state, void uvg_get_sub_coeff(const coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) { // Take subset of coeff array - coeff_t* dst_ptr = dst; + coeff_t* dst_ptr = (coeff_t*)dst; const coeff_t* coeff_ptr = &src[lcu_x + lcu_y * lcu_width]; for (int j = 0; j < block_h; ++j) { //memcpy(dst_coeff + (j * lcu_width), &coeff[j * tr_width], tr_width * sizeof(coeff_t)); diff --git a/src/rdo.c b/src/rdo.c index c9f2db05..f7eb2a9e 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -320,7 +320,7 @@ static INLINE double get_coeff_cabac_cost( coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; if (coeff_order == COEFF_ORDER_LINEAR) { - coeff_ptr = coeff; + coeff_ptr = (coeff_t*)coeff; } else { // Coeff order CU From 4633cb33b5356a105066257e38151aa2c5494d95 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 7 Oct 2022 02:14:45 +0300 Subject: [PATCH 087/254] [isp] Fix error in mts tests. CU isp mode was not nullified before testing. 
--- tests/mts_tests.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/mts_tests.c b/tests/mts_tests.c index e12de73e..b417aa35 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -111,6 +111,7 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); } } @@ -134,6 +135,7 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH); } } @@ -163,6 +165,7 @@ TEST dct(void) tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; @@ -188,6 +191,9 @@ TEST idct(void) cu_info_t tu; tu.type = CU_INTRA; tu.tr_idx = MTS_DST7_DST7 + trafo; + tu.lfnst_idx = 0; + tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; From b16c404362ac97cd8522b863e416bfbd64d945dc Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 11 Oct 2022 16:11:14 +0300 Subject: [PATCH 088/254] [isp] Remove some obsolete TODOs and old commented out code. 
--- src/strategies/generic/intra-generic.c | 51 -------------------------- 1 file changed, 51 deletions(-) diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 98911e5f..e9a3ccf0 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -152,58 +152,18 @@ static void uvg_angular_pred_generic( ref_main = vertical_mode ? temp_above + height : temp_left + width; ref_side = vertical_mode ? temp_left + width : temp_above + height; - // TODO: for non square blocks, need to check if width or height is used for reference extension int size_side = vertical_mode ? height : width; for (int i = -size_side; i <= -1; i++) { ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } - - //const uint32_t index_offset = width + 1; - //const int32_t last_index = width; - //const int_fast32_t most_negative_index = (width * sample_disp) >> 5; - //// Negative sample_disp means, we need to use both references. - - //// TODO: update refs to take into account variating block size and shapes - //// (height is not always equal to width) - //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1; - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - - //// Move the reference pixels to start from the middle to the later half of - //// the tmp_ref, so there is room for negative indices. - //for (int_fast32_t x = -1; x < width; ++x) { - // tmp_ref[x + index_offset] = ref_main[x]; - //} - //// Get a pointer to block index 0 in tmp_ref. - //ref_main = &tmp_ref[index_offset]; - //tmp_ref[index_offset -1] = tmp_ref[index_offset]; - - //// Extend the side reference to the negative indices of main reference. - //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" - //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)]; - //// TODO: add 'vertical_mode ? 
height : width' instead of 'width' - // - //for (int_fast32_t x = -1; x > most_negative_index; x--) { - // col_sample_disp += inv_abs_sample_disp; - // int_fast32_t side_index = col_sample_disp >> 8; - // tmp_ref[x + index_offset - 1] = ref_side[side_index - 1]; - //} - //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1]; - //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset]; } else { - - // TODO: again, separate loop needed for non-square blocks - /*for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { - temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - }*/ memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); memcpy(&temp_left[0], &in_ref_left[0], (left_ref_lenght + 1 + multi_ref_index) * sizeof(uvg_pixel)); ref_main = vertical_mode ? temp_above : temp_left; ref_side = vertical_mode ? temp_left : temp_above; - // TODO: this code block will need to change also when non-square blocks are used const int log2_ratio = log2_width - log2_height; const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); const int max_index = (multi_ref_index << s) + 2; @@ -218,17 +178,6 @@ static void uvg_angular_pred_generic( for (int j = 1; j <= max_index; j++) { ref_main[ref_length + multi_ref_index + j] = val; } - - //// sample_disp >= 0 means we don't need to refer to negative indices, - //// which means we can just use the references as is. - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - //ref_side = (vertical_mode ? 
in_ref_left : in_ref_above) + 1; - - //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel)); - //ref_main = &tmp_ref[width]; - //tmp_ref[width-1] = tmp_ref[width]; - //int8_t last_index = 1 + width*2; - //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1]; } // Flip dimensions for horizontal modes From 0ec16967a130c02700cb465536d9657c6d5d761f Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 12 Oct 2022 15:44:09 +0300 Subject: [PATCH 089/254] [isp] Fix reference building. When ISP was in use, not enough samples were generated. Uninitialized memory was referenced. Fix some typos. --- src/intra.c | 53 ++++++++++++++++++-------- src/strategies/generic/intra-generic.c | 6 +-- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/src/intra.c b/src/intra.c index a831f768..75f0c3a4 100644 --- a/src/intra.c +++ b/src/intra.c @@ -234,7 +234,6 @@ static void intra_filter_reference( filtered_ref->top[ref_width - 1] = ref->top[ref_width - 1]; } - /** * \brief Generate dc prediction. * \param cu_loc CU location and size data. @@ -254,7 +253,7 @@ static void intra_pred_dc( { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - + int_fast16_t sum = 0; // Only one loop is done for non-square blocks. // In case of non-square blocks, only the longer reference is summed. @@ -1004,6 +1003,8 @@ void uvg_intra_build_reference_any( const int cu_x = cu_loc->x; const int cu_y = cu_loc->y; + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); refs->filtered_initialized = false; @@ -1071,7 +1072,7 @@ void uvg_intra_build_reference_any( if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. 
int px_available_left; - if (isp_mode && !is_chroma) { + if (isp_mode && !is_first_isp_block && !is_chroma) { if (isp_mode == ISP_MODE_VER) { px_available_left = height; } @@ -1099,13 +1100,18 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride]; - for (int i = px_available_left; i < cu_height * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = px_available_left; i < tmp_h + multi_ref_index * 2; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // If we are on the left edge, extend the first pixel of the top row. uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; - for (int i = 0; i < height * 2 + multi_ref_index; i++) { + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = 0; i < tmp_h + multi_ref_index; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -1191,7 +1197,7 @@ void uvg_intra_build_reference_any( int px_available_top; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - if (isp_mode && !is_chroma) { + if (isp_mode && !is_first_isp_block && !is_chroma) { if (isp_mode == ISP_MODE_HOR) { px_available_top = width; } @@ -1214,13 +1220,19 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = top_border[px_available_top - 1]; - for (int i = px_available_top; i < width * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? 
cu_width * 2 : (isp_mode ? cu_width + width : width * 2); + for (int i = px_available_top; i < tmp_w + multi_ref_index * 2; ++i) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // Extend nearest pixel. uvg_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val; - for (int i = 0; i < cu_width * 2 + multi_ref_index; i++) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); + for (int i = 0; i < tmp_w + multi_ref_index * 2; i++) { out_top_ref[i + 1] = nearest_pixel; } } @@ -1252,6 +1264,8 @@ void uvg_intra_build_reference_inner( const int cu_x = cu_loc->x; const int cu_y = cu_loc->y; + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + // Log2_dim 1 is possible with ISP blocks assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); @@ -1361,7 +1375,7 @@ void uvg_intra_build_reference_inner( // Get the number of reference pixels based on the PU coordinate within the LCU. int px_available_left; - if (isp_mode && !is_chroma) { + if (isp_mode && !is_first_isp_block && !is_chroma) { if (isp_mode == ISP_MODE_VER) { px_available_left = height; } @@ -1406,7 +1420,10 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = out_left_ref[i]; - for (; i < cu_height * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (; i < tmp_h; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; @@ -1424,7 +1441,7 @@ void uvg_intra_build_reference_inner( // Get the number of reference pixels based on the PU coordinate within the LCU. 
int px_available_top; - if (isp_mode && !is_chroma) { + if (isp_mode && !is_first_isp_block && !is_chroma) { if (isp_mode == ISP_MODE_HOR) { px_available_top = width; } @@ -1452,7 +1469,10 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. nearest_pixel = out_top_ref[i + multi_ref_index]; - for (; i < (cu_width + multi_ref_index) * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); + for (; i < tmp_w + (multi_ref_index * 2); i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel; @@ -1476,14 +1496,14 @@ void uvg_intra_build_reference( { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); - bool first_split = color == COLOR_Y && isp_mode && pu_loc->x == cu_loc->x && pu_loc->y == cu_loc->y; - uint8_t isp = first_split ? 0 : isp_mode; + //bool first_split = color == COLOR_Y && isp_mode && pu_loc->x == cu_loc->x && pu_loc->y == cu_loc->y; + //uint8_t isp = first_split ? 0 : isp_mode; // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp); + uvg_intra_build_reference_inner(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp_mode); } else { - uvg_intra_build_reference_any(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp); + uvg_intra_build_reference_any(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp_mode); } } @@ -1676,6 +1696,7 @@ static void intra_recon_tb_leaf( uint8_t isp_mode = color == COLOR_Y ? 
search_data->pred_cu.intra.isp_mode : 0; uvg_intra_references refs; + // Extra reference lines for use with MRL. Extra lines needed only for left edge. uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index e9a3ccf0..a0bd430d 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -141,7 +141,7 @@ static void uvg_angular_pred_generic( const int cu_dim = MAX(width, height); const int top_ref_length = isp_mode ? width + cu_dim : width << 1; - const int left_ref_lenght = isp_mode ? height + cu_dim : height << 1; + const int left_ref_length = isp_mode ? height + cu_dim : height << 1; // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. @@ -159,7 +159,7 @@ static void uvg_angular_pred_generic( } else { memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - memcpy(&temp_left[0], &in_ref_left[0], (left_ref_lenght + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); ref_main = vertical_mode ? temp_above : temp_left; ref_side = vertical_mode ? temp_left : temp_above; @@ -169,7 +169,7 @@ static void uvg_angular_pred_generic( const int max_index = (multi_ref_index << s) + 2; int ref_length; if (isp_mode) { - ref_length = vertical_mode ? top_ref_length : left_ref_lenght; + ref_length = vertical_mode ? top_ref_length : left_ref_length; } else { ref_length = vertical_mode ? 
width << 1 : height << 1; From 26dcadc149e4e67488df585733836c61c0811e81 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 6 Sep 2022 13:45:08 +0300 Subject: [PATCH 090/254] [mtt] change most if not all of search hierarchy to use cu_loc_t --- src/cu.h | 69 ++--- src/encode_coding_tree.c | 65 +++-- src/encode_coding_tree.h | 19 +- src/filter.c | 23 +- src/inter.c | 357 ++++++++++++------------- src/inter.h | 84 +++--- src/search.c | 183 ++++++------- src/search_ibc.c | 212 +++++++-------- src/search_ibc.h | 2 +- src/search_inter.c | 243 ++++++++--------- src/search_inter.h | 18 +- src/strategies/avx2/intra-avx2.c | 7 - src/strategies/avx2/quant-avx2.c | 1 - src/strategies/generic/quant-generic.c | 2 - src/transform.c | 2 +- src/transform.h | 2 +- tests/mv_cand_tests.c | 7 +- 17 files changed, 579 insertions(+), 717 deletions(-) diff --git a/src/cu.h b/src/cu.h index ecb7c695..1d49d347 100644 --- a/src/cu.h +++ b/src/cu.h @@ -77,55 +77,6 @@ typedef enum { MTS_TR_NUM = 6, } mts_idx; -extern const uint8_t uvg_part_mode_num_parts[]; -extern const uint8_t uvg_part_mode_offsets[][4][2]; -extern const uint8_t uvg_part_mode_sizes[][4][2]; - -/** - * \brief Get the x coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_x x coordinate of the containing CU - * \param i number of the PU - * \return location of the left edge of the PU - */ -#define PU_GET_X(part_mode, cu_width, cu_x, i) \ - ((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the y coordinate of a PU. 
- * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_y y coordinate of the containing CU - * \param i number of the PU - * \return location of the top edge of the PU - */ -#define PU_GET_Y(part_mode, cu_width, cu_y, i) \ - ((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4) - -/** - * \brief Get the width of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return width of the PU - */ -#define PU_GET_W(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the height of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return height of the PU - */ -#define PU_GET_H(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4) ////////////////////////////////////////////////////////////////////////// // TYPES @@ -142,6 +93,25 @@ enum uvg_tree_type { UVG_CHROMA_T = 2 }; +enum split_type { + NO_SPLIT = 0, + QT_SPLIT = 1, + BT_HOR_SPLIT = 2, + BT_VER_SPLIT = 3, + TT_HOR_SPLIT = 4, + TT_VER_SPLIT = 5, +}; + +typedef struct { + uint32_t split_tree; + uint8_t current_depth; +} split_tree_t; + + +// Split for each depth takes three bits like xxy where if either x bit is set
// it is a MTT split, and if there are any MTT split QT split is not allowed +#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0) /** * \brief Struct for CU info */ @@ -149,7 +119,6 @@ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!<
\brief flag to indicate this block is merged diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 46552a12..6f6fc9d8 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -825,11 +825,14 @@ static void encode_transform_coeff( * \param depth Depth from LCU. * \return if non-zero mvd is coded */ -int uvg_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu, double* bits_out) +int uvg_encode_inter_prediction_unit( + encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int depth, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc) { // Mergeflag int16_t num_cand = 0; @@ -864,8 +867,8 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 - uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1)); + if ((LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 + uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(cu_loc->width) + uvg_math_floor_log2(cu_loc->height) + 1) >> 1)); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } @@ -916,16 +919,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, if (lcu) { uvg_inter_get_mv_cand( state, - x, y, width, height, - mv_cand, cur_cu, - lcu, ref_list_idx); + mv_cand, cur_cu, lcu, ref_list_idx, + cu_loc); } else { uvg_inter_get_mv_cand_cua( state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx - ); + mv_cand, cur_cu, ref_list_idx, cu_loc + ); } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); @@ -1346,11 +1347,11 @@ bool uvg_write_split_flag( if (no_split && allow_split) { // Get left and top block 
split_flags and if they are present and true, increase model number // ToDo: should use height and width to increase model, PU_GET_W() ? - if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + if (left_cu && LCU_WIDTH >> left_cu->depth < LCU_WIDTH >> depth) { split_model++; } - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + if (above_cu && LCU_WIDTH >> above_cu->depth < LCU_WIDTH >> depth) { split_model++; } @@ -1625,22 +1626,15 @@ void uvg_encode_coding_tree( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { uint8_t imv_mode = UVG_IMV_OFF; - - const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size]; bool non_zero_mvd = false; + + // TODO: height for non-square blocks + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc.x, cu_loc.y); - for (int i = 0; i < num_pu; ++i) { - // TODO: height for non-square blocks - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, pu_x, pu_y); - - non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); - DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); - uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); - } + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, depth, NULL, NULL, &cu_loc); + DBG_PRINT_MV(state, cu_loc.x, cu_loc.y, cu_loc.width, cu_loc.height, cur_pu); + uvg_hmvp_add_mv(state, x, y, width, height, cur_pu); + // imv mode, select between fullpel, half-pel and quarter-pel resolutions // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel @@ -1661,7 +1655,7 @@ void uvg_encode_coding_tree( int cbf = cbf_is_set_any(cur_cu->cbf, depth); // Only need to signal coded block flag if not
skipped or merged // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + if (!cur_cu->merged) { cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); CABAC_BIN(cabac, cbf, "rqt_root_cbf"); } @@ -1747,15 +1741,18 @@ end: double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; + const int x = cu_loc->x; + const int y = cu_loc->y; + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); @@ -1846,7 +1843,7 @@ double uvg_mock_encode_coding_unit( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; - const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, depth, lcu, &bits, cu_loc); if (ctrl->cfg.amvr && non_zero_mvd) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); if (imv_mode > UVG_IMV_OFF) { diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 575f4afd..231e22ff 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -78,20 +78,19 @@ void uvg_encode_mvd(encoder_state_t * const state, double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type); -int uvg_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, - lcu_t* lcu, - double* 
bits_out); +int uvg_encode_inter_prediction_unit( + encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int depth, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc); void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, cabac_data_t* const cabac, diff --git a/src/filter.c b/src/filter.c index 2d51a17c..26a57100 100644 --- a/src/filter.c +++ b/src/filter.c @@ -855,13 +855,11 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; const int cu_size = LCU_WIDTH >> cu_q->depth; - const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + // TODO: NON square + const int pu_size = dir == EDGE_HOR ? cu_size + : cu_size; + const int pu_pos = dir == EDGE_HOR ? y_coord + : x_coord; get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, LCU_WIDTH >> cu_p->tr_depth, @@ -1088,13 +1086,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, } const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T)); - const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? 
PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + // TODO: non-square + const int pu_size = dir == EDGE_HOR ? cu_size : cu_size; + const int pu_pos = dir == EDGE_HOR ? y_coord + : x_coord; uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; diff --git a/src/inter.c b/src/inter.c index 3bbef427..be353506 100644 --- a/src/inter.c +++ b/src/inter.c @@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -static unsigned inter_recon_unipred(const encoder_state_t * const state, - const uvg_picture * const ref, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, - int32_t out_stride_luma, - const mv_t mv_param[2], - yuv_t *yuv_px, - yuv_im_t *yuv_im, - bool predict_luma, - bool predict_chroma) +static unsigned inter_recon_unipred( + const encoder_state_t * const state, + const uvg_picture * const ref, + int32_t out_stride_luma, + const mv_t mv_param[2], + yuv_t *yuv_px, + yuv_im_t *yuv_im, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { vector2d_t int_mv = { mv_param[0], mv_param[1] }; uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv); + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + const vector2d_t int_mv_in_frame = { int_mv.x + pu_x + state->tile->offset_x, int_mv.y + pu_y + state->tile->offset_y @@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state, * \param predict_luma Enable or disable luma prediction for this call. 
* \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_bipred(const encoder_state_t *const state, +void uvg_inter_recon_bipred( + const encoder_state_t *const state, const uvg_picture *ref1, const uvg_picture *ref2, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, mv_t mv_param[2][2], lcu_t *lcu, bool predict_luma, - bool predict_chroma) + bool predict_chroma, + const cu_loc_t* const cu_loc) { // Allocate maximum size arrays for interpolated and copied samples ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; @@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + yuv_t px_L0; px_L0.size = pu_w * pu_h; px_L0.y = &px_buf_L0[0]; @@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, // Sample blocks from both reference picture lists. // Flags state if the outputs were written to high-precision / interpolated sample buffers. 
- unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0], - &px_L0, &im_L0, predict_luma, predict_chroma); - unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1], - &px_L1, &im_L1, predict_luma, predict_chroma); + unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma, + cu_loc); + unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma, + cu_loc); // After reconstruction, merge the predictors by taking an average of each pixel uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1, @@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma) +void uvg_inter_recon_cu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i); - } + uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc); } static void ibc_recon_cu(const encoder_state_t * const state, @@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t y, int32_t width, bool predict_luma, - bool predict_chroma, - int i_pu) + bool predict_chroma) { const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state, * \param predict_chroma 
Enable or disable chroma prediction for this call. * \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP. */ -void uvg_inter_pred_pu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma, - int i_pu) +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - const int x_scu = SUB_SCU(x); - const int y_scu = SUB_SCU(y); - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); - const int pu_w = PU_GET_W(cu->part_size, width, i_pu); - const int pu_h = PU_GET_H(cu->part_size, width, i_pu); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + const int x_scu = SUB_SCU(cu_loc->x); + const int y_scu = SUB_SCU(cu_loc->y); + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - if (cu->type == CU_IBC) { - ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); - } else { + if (pu->inter.mv_dir == 3) { + const uvg_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + uvg_inter_recon_bipred(state, + refs[0], refs[1], + pu->inter.mv, lcu, + predict_luma, predict_chroma, + cu_loc); + } + else if (pu->type == CU_IBC) { + ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma); + } else{ + const int mv_idx = pu->inter.mv_dir - 1; + const uvg_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; - if (pu->inter.mv_dir == 3) { - const uvg_picture * const refs[2] = { - state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]], - 
state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]], - }; - uvg_inter_recon_bipred( - state, - refs[0], - refs[1], - pu_x, - pu_y, - pu_w, - pu_h, - pu->inter.mv, - lcu, - predict_luma, - predict_chroma); - } else { - const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture * const ref = - state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]; + const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x); + const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2; + yuv_t lcu_adapter; + lcu_adapter.size = cu_loc->width * cu_loc->height; + lcu_adapter.y = lcu->rec.y + offset_luma, + lcu_adapter.u = lcu->rec.u + offset_chroma, + lcu_adapter.v = lcu->rec.v + offset_chroma, - const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); - const unsigned offset_chroma = - SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; - yuv_t lcu_adapter; - lcu_adapter.size = pu_w * pu_h; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, - - inter_recon_unipred( - state, - ref, - pu_x, - pu_y, - pu_w, - pu_h, - LCU_WIDTH, - pu->inter.mv[mv_idx], - &lcu_adapter, - NULL, - predict_luma, - predict_chroma); - } + inter_recon_unipred(state, + ref, + LCU_WIDTH, pu->inter.mv[mv_idx], + &lcu_adapter, + NULL, + predict_luma, + predict_chroma, + cu_loc); } if (predict_chroma && state->encoder_control->cfg.jccr) { const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + 
offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -915,14 +899,12 @@ static bool is_b0_cand_coded(int x, int y, int width, int height) * \param ref_idx index in the reference list * \param cand_out will be filled with C0 and C1 candidates */ -static void get_temporal_merge_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - uint8_t ref_list, - uint8_t ref_idx, - merge_candidates_t *cand_out) +static void get_temporal_merge_candidates( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + uint8_t ref_list, + uint8_t ref_idx, + merge_candidates_t *cand_out) { /* Predictor block locations @@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref]; int cu_per_width = ref_cu_array->width / SCU_WIDTH; - int32_t xColBr = x + width; - int32_t yColBr = y + height; + int32_t xColBr = cu_loc->x + cu_loc->width; + int32_t yColBr = cu_loc->y + cu_loc->height; // C0 must be available if (xColBr < state->encoder_control->in.width && @@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } } - int32_t xColCtr = x + (width / 2); - int32_t yColCtr = y + (height / 2); + int32_t xColCtr = cu_loc->x + (cu_loc->width / 2); + int32_t yColCtr = cu_loc->y + (cu_loc->height / 2); // C1 must be inside the LCU, in the center position of current CU if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) { @@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates(int32_t x, - int32_t y, - int32_t width, - int32_t height, +static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc, int32_t picture_width, int32_t 
picture_height, lcu_t *lcu, @@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(cu_loc->y); + + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; // A0 and A1 availability testing if (x != 0) { cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); @@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x, * \param picture_height tile height in pixels * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates_cua(const cu_array_t *cua, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int32_t picture_width, - int32_t picture_height, - merge_candidates_t *cand_out, - bool wpp) +static void get_spatial_merge_candidates_cua( + const cu_array_t *cua, + int32_t picture_width, + int32_t picture_height, + merge_candidates_t *cand_out, + bool wpp, + const cu_loc_t* const cu_loc) { /* Predictor block locations @@ -1370,8 +1352,12 @@ static void get_spatial_merge_candidates_cua(const cu_array_t *cua, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; + const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(y); // A0 and A1 availability testing if (x != 0) { const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); @@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state, /** * \brief Pick two mv candidates from the spatial and temporal 
candidates. */ -static void get_mv_cand_from_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - const merge_candidates_t *merge_cand, - const cu_info_t * const cur_cu, - int8_t reflist, - mv_t mv_cand[2][2]) +static void get_mv_cand_from_candidates( + const encoder_state_t * const state, + const merge_candidates_t *merge_cand, + const cu_info_t * const cur_cu, + int8_t reflist, + mv_t mv_cand[2][2], + int ctu_row) { const cu_info_t *const *a = merge_cand->a; const cu_info_t *const *b = merge_cand->b; @@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, if (candidates < AMVP_MAX_NUM_CANDS) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { @@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t * const cur_cu, - lcu_t *lcu, - int8_t reflist) +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t * const cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); 
memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); - } else { - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, * \param cur_cu current CU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist) +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const cu_array_t *cua = state->tile->frame->cu_array; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { 
get_spatial_merge_candidates_cua(cua, - x, y, width, height, - state->tile->frame->width, state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp, + cu_loc); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1885,23 +1864,23 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) { * \param lcu lcu containing the block * \return number of merge candidates */ -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu) +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu) { uint8_t candidates = 0; int8_t zero_idx = 0; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; merge_candidates_t merge_cand = { 0 }; const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; + // Current CU + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y)); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); if(cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, 
cu_loc->width, cu_loc->height,ibc_mv_cand); for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) { mv_cand[i].dir = 1; mv_cand[i].mv[0][0] = ibc_mv_cand[i][0]; @@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, } return IBC_MRG_MAX_NUM_CANDS; } - - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level, state->encoder_control->cfg.wpp); + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); const cu_info_t **a = merge_cand.a; const cu_info_t **b = merge_cand.b; - if (!use_a1) a[1] = NULL; - if (!use_b1) b[1] = NULL; + const int x = cu_loc->x; + const int y = cu_loc->y; if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++; if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++; @@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, for (int reflist = 0; reflist <= max_reflist; reflist++) { // Fetch temporal candidates for the current CU // ToDo: change collocated_from_l0_flag to allow L1 ref - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // TODO: enable L1 TMVP candidate // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand); @@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, if (candidates == max_num_cands) return candidates; if (candidates != max_num_cands - 1) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = 
state->tile->frame->hmvp_size[ctu_row]; diff --git a/src/inter.h b/src/inter.h index 45f5e5ea..4d5fccd5 100644 --- a/src/inter.h +++ b/src/inter.h @@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv); void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver); void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv); -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma); - -void uvg_inter_pred_pu(const encoder_state_t * const state, +void uvg_inter_recon_cu( + const encoder_state_t * const state, lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, bool predict_luma, bool predict_chroma, - int i_pu); + const cu_loc_t* const cu_loc); + +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu); -void uvg_inter_recon_bipred(const encoder_state_t * const state, - const uvg_picture * ref1, - const uvg_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - mv_t mv_param[2][2], - lcu_t* lcu, - bool predict_luma, - bool predict_chroma); +void uvg_inter_recon_bipred( + const encoder_state_t * const state, + const uvg_picture * ref1, + const uvg_picture * ref2, + mv_t mv_param[2][2], + lcu_t* lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - lcu_t *lcu, - int8_t reflist); +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + lcu_t *lcu, + int8_t reflist, + const 
cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist); +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc); -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu); +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu); #endif diff --git a/src/search.c b/src/search.c index 0b51412b..d61be039 100644 --- a/src/search.c +++ b/src/search.c @@ -166,7 +166,6 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y); to->type = cu->type; to->depth = cu->depth; - to->part_size = cu->part_size; to->qp = cu->qp; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; @@ -191,22 +190,6 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } -static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type) -{ - const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; - const int num_pu = uvg_part_mode_num_parts[part_mode]; - - for (int i = 0; i < num_pu; ++i) { - const int x_pu = PU_GET_X(part_mode, cu_width, x_local, i); - const int y_pu = PU_GET_Y(part_mode, cu_width, y_local, i); - const int width_pu = PU_GET_W(part_mode, cu_width, i); - const int height_pu = PU_GET_H(part_mode, cu_width, i); - - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - pu->type = type; - lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu); - } -} static void lcu_fill_cbf(lcu_t 
*lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) { @@ -559,7 +542,7 @@ static double cu_rd_cost_tr_split_accurate( int cbf = cbf_is_set_any(pred_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (pred_cu->type != CU_INTRA && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); } @@ -876,18 +859,20 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) */ static double search_cu( encoder_state_t* const state, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* work_tree, enum uvg_tree_type - tree_type) + tree_type, + const split_tree_t split_tree) { + const int depth = split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const int cu_width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int cu_height = cu_width; // TODO: height - const int luma_width = LCU_WIDTH >> depth; + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; + const int x = cu_loc->x; + const int y = cu_loc->y; + const int luma_width = cu_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -896,7 +881,7 @@ static double search_cu( cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; cu_info_t hmvp_lut[MAX_NUM_HMVP_CANDS]; @@ -913,7 +898,7 @@ static double search_cu( int32_t max; } pu_depth_inter, pu_depth_intra; - lcu_t *const lcu = &work_tree[depth]; + lcu_t *const lcu = &work_tree[split_tree.current_depth]; int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); @@ -947,10 +932,9 @@ static double search_cu( cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth - cur_cu->depth = (depth > MAX_DEPTH) ? MAX_DEPTH : depth; - cur_cu->tr_depth = (depth > 0) ? depth : 1; + cur_cu->depth = (split_tree.current_depth > MAX_DEPTH) ? MAX_DEPTH : split_tree.current_depth; + cur_cu->tr_depth = cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH ? 1 : split_tree.current_depth; cur_cu->type = CU_NOTSET; - cur_cu->part_size = SIZE_2Nx2N; cur_cu->qp = state->qp; cur_cu->bdpcmMode = 0; cur_cu->tr_idx = 0; @@ -969,9 +953,9 @@ static double search_cu( int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != UVG_SLICE_I && - depth <= MAX_DEPTH && + split_tree.current_depth <= MAX_DEPTH && ( - WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || + WITHIN(split_tree.current_depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. 
@@ -983,10 +967,9 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_inter(state, - x, y, - depth, - lcu, - &mode_cost, &mode_bitcost); + cu_loc, lcu, + &mode_cost, + &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -1004,7 +987,7 @@ static double search_cu( int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(split_tree.current_depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. @@ -1048,7 +1031,7 @@ static double search_cu( int8_t intra_mode = intra_search.pred_cu.intra.mode; // TODO: This heavily relies to square CUs - if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; // There is almost no benefit to doing the chroma mode search for @@ -1097,7 +1080,7 @@ static double search_cu( } intra_search.pred_cu.intra.mode = intra_mode; if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, x_local, y_local, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, x_local, y_local, split_tree.current_depth, split_tree.current_depth, tree_type); } } if (intra_cost < cost) { @@ -1120,8 +1103,7 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_ibc(state, - x, y, - depth, + cu_loc, lcu, &mode_cost, &mode_bitcost); if (mode_cost < cost) { @@ -1138,11 +1120,10 @@ static double search_cu( // Reconstruct best mode because we need the reconstructed pixels for // mode search of adjacent CUs. 
if (cur_cu->type == CU_INTRA) { - assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if ((depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if ((split_tree.current_depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1153,7 +1134,7 @@ static double search_cu( lcu, tree_type,recon_luma,recon_chroma); - if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { + if(split_tree.current_depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, x, y, @@ -1168,8 +1149,8 @@ static double search_cu( const int split_type = intra_search.pred_cu.intra.isp_mode; const int split_num = split_type == ISP_MODE_NO_ISP ? 0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); - const int cbf_cb = cbf_is_set(cur_cu->cbf, depth, COLOR_U); - const int cbf_cr = cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int cbf_cb = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_V); const int jccr = cur_cu->joint_cb_cr; for (int i = 0; i < split_num; ++i) { cu_loc_t isp_loc; @@ -1181,15 +1162,14 @@ static double search_cu( uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; - // ISP_TODO: here, cbfs are also set for chroma for all ISP splits, is this behavior wanted? 
- cbf_clear(&split_cu->cbf, depth, COLOR_Y); - cbf_clear(&split_cu->cbf, depth, COLOR_U); - cbf_clear(&split_cu->cbf, depth, COLOR_V); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_Y); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_U); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_V); if (cur_cbf) { - cbf_set(&split_cu->cbf, depth, COLOR_Y); + cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_Y); } - if(cbf_cb) cbf_set(&split_cu->cbf, depth, COLOR_U); - if(cbf_cr) cbf_set(&split_cu->cbf, depth, COLOR_V); + if(cbf_cb) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_V); split_cu->joint_cb_cr = jccr; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1205,24 +1185,20 @@ static double search_cu( } // Reset transform depth because intra messes with them. // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } + int tr_depth = MAX(1, split_tree.current_depth); + uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); + uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, split_tree.current_depth) + inter_bitcost * state->lambda; } cu_loc_t loc; - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - uvg_cu_loc_ctor(&loc, x, y, width, height); + uvg_cu_loc_ctor(&loc, x, y, cu_width, cu_height); 
uvg_quantize_lcu_residual(state, true, has_chroma && !cur_cu->joint_cb_cr, cur_cu->joint_cb_cr, &loc, @@ -1232,9 +1208,9 @@ static double search_cu( false, tree_type); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); - if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -1244,7 +1220,7 @@ static double search_cu( inter_bitcost += cur_cu->merge_idx; } } - lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } } @@ -1253,19 +1229,13 @@ static double search_cu( double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; + + bits += uvg_mock_encode_coding_unit( + state, + cabac, + cu_loc, lcu, cur_cu, + tree_type); - if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { - bits += uvg_mock_encode_coding_unit( - state, - cabac, - x, y, depth, - lcu, - cur_cu, - tree_type); - } - else { - assert(0); - } cost = bits * state->lambda; @@ -1275,15 +1245,15 @@ static double search_cu( cost = inter_zero_coeff_cost; // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu, tree_type); + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[split_tree.current_depth + 1], lcu, tree_type); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->merged = 0; cur_cu->skipped = 1; lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - if (cur_cu->tr_depth != depth) { + if (cur_cu->tr_depth != 0) { // Reset transform depth since there are no coefficients. This // ensures that CBF is cleared for the whole area of the CU. 
uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type); @@ -1299,12 +1269,12 @@ static double search_cu( // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. cur_cu->type == CU_NOTSET || - (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || + (split_tree.current_depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || (state->frame->slicetype != UVG_SLICE_I && - depth < pu_depth_inter.max); + split_tree.current_depth < pu_depth_inter.max); if(state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, split_tree.current_depth, tree_type); fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } @@ -1312,7 +1282,7 @@ static double search_cu( if (can_split_cu) { int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); cabac_data_t post_seach_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1320,7 +1290,7 @@ static double search_cu( double split_bits = 0; - if (depth < MAX_DEPTH) { + if (split_tree.current_depth < MAX_DEPTH) { state->search_cabac.update = 1; // Add cost of cu_split_flag. @@ -1364,10 +1334,24 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. 
if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); + const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << split_tree.current_depth, split_tree.current_depth + 1}; + cu_loc_t new_cu_loc; + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x, y, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x, y + half_cu, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y + half_cu, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } } else { split_cost = INT_MAX; } @@ -1401,7 +1385,6 @@ static double search_cu( cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; - cur_cu->part_size = SIZE_2Nx2N; // Disable MRL in this case cur_cu->intra.multi_ref_idx = 0; @@ -1687,14 +1670,17 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con int tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? 
UVG_LUMA_T : UVG_BOTH_T; + + cu_loc_t start; + uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0 }; // Start search from depth 0. double cost = search_cu( - state, - x, - y, - 0, + state, + &start, work_tree, - tree_type); + tree_type, + split_tree); // Save squared cost for rate control. if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1710,12 +1696,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( - state, - x, - y, - 0, + state, &start, work_tree, - UVG_CHROMA_T); + UVG_CHROMA_T, split_tree); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; diff --git a/src/search_ibc.c b/src/search_ibc.c index 44f9ac50..b7067c8c 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -109,8 +109,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x, } -static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -132,9 +134,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; + uint32_t width = loc->width; + uint32_t height = loc->height; cost = 
uvg_satd_any_size(width, width, @@ -162,8 +166,10 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu } -static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); cu_info_t cu_backup = *cur_cu; @@ -173,6 +179,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s const int y_scu = SUB_SCU(y); const uint32_t offset = x_scu + y_scu * LCU_WIDTH; const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + const uint32_t width = loc->width; + const uint32_t height = loc->height; cur_cu->type = CU_IBC; cur_cu->inter.mv_dir = 1; @@ -183,7 +191,7 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; @@ -235,8 +243,11 @@ static bool check_mv_cost(ibc_search_info_t *info, double bitcost = 0; double cost = MAX_DOUBLE; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height); - cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y); + + cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, &loc, x, y); if (cost >= *best_cost) return false; @@ -246,7 +257,7 @@ static bool 
check_mv_cost(ibc_search_info_t *info, info->mv_cand, NULL, 0, - NULL, + 0, &bitcost ); @@ -782,63 +793,46 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_ibc(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - unit_stats_map_t *amvp, - unit_stats_map_t *merge, - ibc_search_info_t *info) +static void search_pu_ibc( + encoder_state_t * const state, + const cu_loc_t * const cu_loc, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + ibc_search_info_t *info) { - const uvg_config *cfg = &state->encoder_control->cfg; - const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); - - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. 
- const bool merge_b1 = i_pu == 0 || width <= height; - + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = cu_loc->width; + const int height_cu= cu_loc->height; lcu_t *lcu = info->lcu; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); - cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; - cur_pu->qp = state->qp; - cur_pu->inter.mv_dir = 1; + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_IBC; + cur_pu->qp = state->qp; + cur_pu->inter.mv_dir = 1; // Default to candidate 0 CU_SET_MV_CAND(cur_pu, 0, 0); - + FILL(*info, 0); - info->state = state; - info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; - info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); - info->lcu = lcu; + info->state = state; + info->pic = frame->source; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; + info->mvd_cost_func = + cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info->optimized_sad = uvg_get_optimized_sad(width_cu); + info->lcu = lcu; // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu); @@ -853,7 +847,7 @@ static void search_pu_ibc(encoder_state_t * const state, #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0); #else const double no_skip_flag = 0; #endif @@ -875,7 +869,7 @@ static void search_pu_ibc(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_IBC; merge->unit[merge->size].merge_idx = merge_idx; @@ -883,11 +877,11 @@ static void search_pu_ibc(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu, lcu->rec.y + y_local * 
LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -909,7 +903,7 @@ static void search_pu_ibc(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -919,6 +913,7 @@ static void search_pu_ibc(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. @@ -927,18 +922,18 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + cu_loc, depth, cur_pu, lcu, true, 
UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf, depth)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; @@ -964,15 +959,12 @@ static void search_pu_ibc(encoder_state_t * const state, // Do the motion search - uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, + uvg_inter_get_mv_cand(info->state, info->mv_cand, cur_pu, lcu, - NULL); + 0, + cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1003,9 +995,7 @@ static void search_pu_ibc(encoder_state_t * const state, best_cost = calculate_ibc_cost_satd( info->state, lcu, - info->origin.x, - info->origin.y, - info->width, + cu_loc, (best_mv.x >> INTERNAL_MV_PREC), (best_mv.y >> INTERNAL_MV_PREC)); best_cost += best_bits * info->state->lambda; @@ -1052,16 +1042,16 @@ static void search_pu_ibc(encoder_state_t * const state, }; - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (state->encoder_control->cfg.rdo >= 2) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); } if(cfg->rdo < 2) { int predmode_ctx; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -1077,33 +1067,29 @@ static void search_pu_ibc(encoder_state_t * const state, #include "threads.h" static int 
uvg_search_hash_cu_ibc(encoder_state_t* const state, - int x, int y, int depth, + const cu_loc_t* cu_loc, lcu_t* lcu, double* inter_cost, double* inter_bitcost) { - const int x_cu = x; - const int y_cu = y; + const int x_cu = cu_loc->x; + const int y_cu = cu_loc->y; const int part_mode = SIZE_2Nx2N; const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int width = PU_GET_W(part_mode, width_cu, 0); - const int height = PU_GET_H(part_mode, width_cu, 0); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; const bool merge_a1 = true; const bool merge_b1 = true; ibc_search_info_t info; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(x_cu); + const int y_local = SUB_SCU(y_cu); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1113,22 +1099,19 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.state = state; info.pic = frame->source; - info.origin.x = x; - info.origin.y = y; - info.width = width; - info.height = height; + info.origin.x = cu_loc->x; + info.origin.y = cu_loc->y; + info.width = width_cu; + info.height = height_cu; info.mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info.optimized_sad = uvg_get_optimized_sad(width); + info.optimized_sad = uvg_get_optimized_sad(width_cu); info.lcu = lcu; // Search for merge mode candidates info.num_merge_cand = uvg_inter_get_merge_cand( state, - x, - y, - width, - height, + cu_loc, merge_a1, merge_b1, info.merge_cand, @@ -1154,8 +1137,8 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, UVG_CLOCK_T hashmap_end_real_time; UVG_GET_TIME(&hashmap_start_real_time); - int xx = x; - int yy = y; + int xx = x_cu; + int yy = y_cu; int best_mv_x = INT_MAX>>2; int best_mv_y = INT_MAX>>2; @@ -1185,12 +1168,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, int pos_y = result->value & 0xffff; int mv_x = pos_x - xx; int mv_y = pos_y - yy; - if (pos_x <= xx - width && pos_y <= yy - height) { + if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) { valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); if (valid_mv) { bool full_block = true; // Is the full block covered by the IBC? - for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) { - for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) { + for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) { + for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) { uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[ ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE]; @@ -1220,7 +1203,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, best_mv_y = mv_y; ibc_cost = cost; ibc_bitcost = bits; - fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); + fprintf(stderr, "Found best IBC!! 
%dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y); found_block = true; //break; } @@ -1274,11 +1257,9 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, uvg_inter_recon_cu( state, lcu, - x, - y, - CU_WIDTH_FROM_DEPTH(depth), true, - state->encoder_control->chroma_format != UVG_CSP_400); + state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range( @@ -1305,17 +1286,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, * \param inter_bitcost Return inter bitcost */ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) + const cu_loc_t * const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; + // Quick hashmap search /* uvg_search_hash_cu_ibc( state, - x, y, depth, + cu_loc, lcu, inter_cost, inter_bitcost); @@ -1330,7 +1312,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, info.lcu = lcu; search_pu_ibc(state, - x, y, depth, + cu_loc, SIZE_2Nx2N, 0, amvp, &merge, @@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; cur_pu->type = CU_IBC; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_ibc.h b/src/search_ibc.h index 14ce3b6f..b3c4e544 100644 --- a/src/search_ibc.h +++ b/src/search_ibc.h @@ 
-46,7 +46,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, + const cu_loc_t * const cu_loc, lcu_t *lcu, double *inter_cost, double* inter_bitcost); diff --git a/src/search_inter.c b/src/search_inter.c index 93598ff2..53587b84 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc, /** * \brief Perform inter search for a single reference frame. */ -static void search_pu_inter_ref(inter_search_info_t *info, - int depth, +static void search_pu_inter_ref( + inter_search_info_t *info, lcu_t *lcu, cu_info_t *cur_cu, unit_stats_map_t *amvp) @@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); + uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + info->mv_cand, + cur_cu, + lcu, + ref_list, + &cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info, /** * \brief Search bipred modes for a PU. 
*/ -static void search_pu_inter_bipred(inter_search_info_t *info, - int depth, - lcu_t *lcu, - unit_stats_map_t *amvp_bipred) +static void search_pu_inter_bipred( + inter_search_info_t *info, + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; const videoframe_t * const frame = info->state->tile->frame; @@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc); } // Don't try merge candidates that don't satisfy mv constraints. @@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info, uvg_inter_recon_bipred(info->state, ref->images[ref_LX[0][merge_cand[i].ref[0]]], ref->images[ref_LX[1][merge_cand[j].ref[1]]], - x, y, - width, - height, mv, lcu, true, - false); + false, + &cu_loc); const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride]; @@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, +static void search_pu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, lcu_t *lcu, unit_stats_map_t *amvp, unit_stats_map_t *merge, @@ -1678,26 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state, { const uvg_config *cfg = &state->encoder_control->cfg; 
const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int height_cu = width_cu; // TODO: non-square blocks - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1708,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state, info->state = state; info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); + info->optimized_sad = uvg_get_optimized_sad(width_cu); // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu ); @@ -1755,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state, // If bipred is not enabled, do not try candidates with mv_dir == 3. 
// Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); @@ -1769,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_INTER; merge->unit[merge->size].merge_idx = merge_idx; @@ -1777,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -1803,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && 
merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -1813,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. @@ -1825,23 +1811,20 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x, y, width_cu, height_cu); - - uvg_quantize_lcu_residual(state, true, false, false, &loc, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - &loc, depth, cur_pu, lcu, + cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf, depth)) { @@ -1876,7 +1859,7 @@ static void search_pu_inter(encoder_state_t * const state, info->ref_idx = ref_idx; info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, lcu, cur_pu, amvp); } assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); @@ -1941,14 +1924,11 @@ static 
void search_pu_inter(encoder_state_t * const state, info->ref = ref->images[info->ref_idx]; uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - unipred_pu, - lcu, - list); + info->mv_cand, + unipred_pu, + lcu, + list, + cu_loc); double frac_cost = MAX_DOUBLE; double frac_bits = MAX_INT; @@ -1969,8 +1949,8 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + if (state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc); } amvp[list].cost[key] = frac_cost; @@ -1992,15 +1972,15 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); + if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc); } // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B && cfg->bipred - && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred + && cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are 
restricted to unipred if (can_use_bipred) { @@ -2031,25 +2011,23 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc); } uvg_inter_recon_bipred(info->state, - ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], - ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], - x, y, - width, - height, - mv, - lcu, - true, - false); + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], + mv, lcu, + true, + false, + cu_loc + ); - const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; + const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; best_bipred_cost = - uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); + uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH); double bitcost[2] = { 0, 0 }; @@ -2096,17 +2074,17 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]); assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); uvg_sort_keys_by_cost(&amvp[2]); - if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, 
&amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc); } } if(cfg->rdo < 2) { int predmode_ctx; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -2140,25 +2118,23 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - cu_info_t* cur_cu, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost){ - - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T); +void uvg_cu_cost_inter_rd2( + encoder_state_t * const state, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost, + const cu_loc_t* const cu_loc){ - const int x_px = SUB_SCU(x); - const int y_px = SUB_SCU(y); + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int tr_depth = MAX(1, depth); + + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, tr_depth, UVG_BOTH_T); + + const int x_px = SUB_SCU(cu_loc->x); + const int y_px = SUB_SCU(cu_loc->y); const int width = LCU_WIDTH >> depth; const int height = width; // TODO: non-square blocks - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x, y, width, height); cabac_data_t cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); @@ -2169,7 +2145,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - 
uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc); int index = y_px * LCU_WIDTH + x_px; double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], @@ -2187,13 +2163,13 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, } double no_cbf_bits; double bits = 0; - const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL); + if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; @@ -2207,7 +2183,8 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, uvg_quantize_lcu_residual(state, true, false, - false, &loc, + false, + cu_loc, depth, cur_cu, lcu, @@ -2243,7 +2220,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, depth, lcu, &cabac_copy, - &loc, + cu_loc, index, 0, cur_cu, @@ -2274,7 +2251,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, uvg_quantize_lcu_residual(state, true, reconstruct_chroma, reconstruct_chroma && state->encoder_control->cfg.jccr, - &loc, + cu_loc, depth, cur_cu, lcu, @@ -2308,7 +2285,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if 
(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->skipped = 1; } *inter_cost = no_cbf_cost; @@ -2332,11 +2309,12 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2349,12 +2327,8 @@ void uvg_search_cu_inter(encoder_state_t * const state, inter_search_info_t info; search_pu_inter(state, - x, y, depth, - SIZE_2Nx2N, 0, - lcu, - amvp, - &merge, - &info); + cu_loc, lcu, amvp, + &merge, &info); // Early Skip CU decision if (merge.size == 1 && merge.unit[0].skipped) { @@ -2396,13 +2370,14 @@ void uvg_search_cu_inter(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index d76dd927..cdabd15a 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -73,11 +73,12 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state, int32_t ref_idx, double *bitcost); -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, 
- lcu_t *lcu, - double *inter_cost, - double* inter_bitcost); +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost); @@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); -void uvg_cu_cost_inter_rd2(encoder_state_t* const state, - int x, int y, int depth, +void uvg_cu_cost_inter_rd2( + encoder_state_t* const state, cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, - double* inter_bitcost); + double* inter_bitcost, + const cu_loc_t* const cu_loc); int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 2783454d..1d3c117f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -294,13 +294,6 @@ static void uvg_angular_pred_avx2( f[yy][2] = 16 + offset; f[yy][3] = offset; } - // Cubic must be used if ref line != 0 or if isp mode != 0 - if (multi_ref_index || isp) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract[yy] >> 1), 32 - (delta_fract[yy] >> 1), 16 + (delta_fract[yy] >> 1), delta_fract[yy] >> 1 }; - const int16_t *temp_f = use_cubic ? cubic_filter[delta_fract[yy]] : filter_coeff; - memcpy(f[yy], temp_f, 4 * sizeof(*temp_f)); } // Do 4-tap intra interpolation filtering diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index b6d062b0..bc70daab 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -708,7 +708,6 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 
1 : 0); uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 4215fc81..be396a8b 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -316,7 +316,6 @@ int uvg_quant_cbcr_residual_generic( (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, cur_cu->cr_lfnst_idx); @@ -499,7 +498,6 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); diff --git a/src/transform.c b/src/transform.c index b260eea1..a497003b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -490,7 +490,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, const int offset, const uint8_t mode, cu_info_t* pred_cu, diff --git a/src/transform.h b/src/transform.h index 6fdef411..a7427ea0 100644 --- a/src/transform.h +++ b/src/transform.h @@ -108,7 +108,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, const int offset, const uint8_t mode, cu_info_t* pred_cu, diff --git a/tests/mv_cand_tests.c b/tests/mv_cand_tests.c index 84ab9328..849fec2d 100644 --- a/tests/mv_cand_tests.c +++ b/tests/mv_cand_tests.c @@ -46,8 +46,11 @@ TEST 
test_get_spatial_merge_cand(void) merge_candidates_t cand = { 0 }; - get_spatial_merge_candidates(64 + 32, 64, // x, y - 32, 24, // width, height + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y + 32, 24); // width, height) + + get_spatial_merge_candidates(&cu_loc, 1920, 1080, // picture size &lcu, &cand, From dcf879e5ed52d56ca29c3c213d70fbed6d3bd3db Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 7 Sep 2022 16:11:36 +0300 Subject: [PATCH 091/254] [mtt] remove all rest usages of deriving width and height from depth --- src/cu.c | 2 + src/cu.h | 6 +- src/encode_coding_tree.c | 117 +++++++++-------- src/encode_coding_tree.h | 15 +-- src/encoderstate.c | 9 +- src/intra.c | 72 ++++++----- src/intra.h | 9 +- src/search.c | 72 +++++------ src/search_intra.c | 180 +++++++++++++-------------- src/search_intra.h | 10 +- src/strategies/generic/dct-generic.c | 4 +- src/transform.c | 7 +- src/transform.h | 1 - 13 files changed, 257 insertions(+), 247 deletions(-) diff --git a/src/cu.c b/src/cu.c index 3a0f03fa..aedf341c 100644 --- a/src/cu.c +++ b/src/cu.c @@ -306,6 +306,8 @@ void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) loc->x = x; loc->y = y; + loc->local_x = x % LCU_WIDTH; + loc->local_y = y % LCU_WIDTH; loc->width = width; loc->height = height; // TODO: when MTT is implemented, chroma dimensions can be minimum 2. 
diff --git a/src/cu.h b/src/cu.h index 1d49d347..dfad7861 100644 --- a/src/cu.h +++ b/src/cu.h @@ -119,7 +119,7 @@ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t tr_depth : 3; //!< \brief transform depth + uint8_t tr_depth ; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index @@ -129,6 +129,8 @@ typedef struct uint16_t cbf; + uint32_t split_tree : 3 * 9; + /** * \brief QP used for the CU. * @@ -170,6 +172,8 @@ typedef struct typedef struct { int16_t x; int16_t y; + uint8_t local_x; + uint8_t local_y; int8_t width; int8_t height; int8_t chroma_width; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 6f6fc9d8..fcb6d308 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -660,7 +660,7 @@ static void encode_transform_coeff( bool last_split, bool can_skip_last_cbf, int *luma_cbf_ctx, // Always true except when writing sub partition coeffs (ISP) - cu_loc_t *original_loc) // Original dimensions before ISP split + const cu_loc_t * const original_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; int x = cu_loc->x; @@ -829,7 +829,6 @@ int uvg_encode_inter_prediction_unit( encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int depth, lcu_t* lcu, double* bits_out, const cu_loc_t* const cu_loc) @@ -867,7 +866,7 @@ int uvg_encode_inter_prediction_unit( // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; - if ((LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 + if (cu_loc->width + cu_loc->height > 12) { // ToDo: limit on 4x8/8x4 uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(cu_loc->width) + uvg_math_floor_log2(cu_loc->height) + 1) >> 1)); CABAC_FBITS_UPDATE(cabac, 
&(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); @@ -1038,10 +1037,13 @@ static void encode_chroma_intra_cu( else if (cabac->only_count && bits_out)*bits_out += bits; } -void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out) +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual; @@ -1053,6 +1055,9 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uint32_t flag; double bits = 0; + const int x = cu_loc->x; + const int y = cu_loc->y; + /* if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) { cabac->cur_ctx = &(cabac->ctx.bdpcm_mode[0]); @@ -1076,8 +1081,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } */ - uint32_t width = (LCU_WIDTH >> depth); - uint32_t height = (LCU_WIDTH >> depth); // TODO: height for non-square blocks + uint32_t width = cu_loc->width; + uint32_t height = cu_loc->height; // TODO: height for non-square blocks // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; @@ -1102,9 +1107,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) { - const int cu_width = LCU_WIDTH >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks - uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? NULL : frame->cu_array); + uint8_t ctx_id = uvg_get_mip_flag_context(cu_loc, lcu, lcu ? 
NULL : frame->cu_array); // Write MIP flag CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag"); @@ -1149,8 +1152,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[1]), isp_mode - 1, bits, "intra_subpartitions_split_type"); // Vertical or horizontal split } } - - const int cu_width = LCU_WIDTH >> depth; + // PREDINFO CODING // If intra prediction mode is found from the predictors, // it can be signaled with two EP's. Otherwise we can send @@ -1165,7 +1167,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, if (x > 0) { assert(x >> 2 > 0); const int x_scu = SUB_SCU(x) - 1; - const int y_scu = SUB_SCU(y + cu_width - 1); + const int y_scu = SUB_SCU(y + height - 1); left_pu = lcu ? LCU_GET_CU_AT_PX( lcu, @@ -1174,7 +1176,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uvg_cu_array_at_const( frame->cu_array, x - 1, - y + cu_width - 1); + y + height - 1); } // Don't take the above PU across the LCU boundary. if (y % LCU_WIDTH > 0 && y > 0) { @@ -1182,11 +1184,11 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, above_pu = lcu ? LCU_GET_CU_AT_PX( lcu, - SUB_SCU(x + cu_width - 1), + SUB_SCU(x + width - 1), SUB_SCU(y) - 1) : uvg_cu_array_at_const( frame->cu_array, - x + cu_width - 1, + x + width - 1, y - 1); } @@ -1405,28 +1407,25 @@ bool uvg_write_split_flag( void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x, - uint16_t y, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const split_tree_t split_tree) { cabac_data_t * const cabac = &state->cabac; const encoder_control_t * const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? 
frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); - - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - cu_loc_t cu_loc; - uvg_cu_loc_ctor(&cu_loc, x, y, width, height); - - const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc.width : cu_loc.chroma_width; - const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc.height : cu_loc.chroma_height; + const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); + + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; const int half_cu = cu_width >> 1; - + const int x = cu_loc->x; + const int y = cu_loc->y; + + const int depth = split_tree.current_depth; const cu_info_t *left_cu = NULL; if (x > 0) { @@ -1458,33 +1457,33 @@ void uvg_encode_coding_tree( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) { - const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, tree_type,NULL); + const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, depth, cu_width, x, y, tree_type,NULL); if (split_flag || border) { + const int half_luma = cu_loc->width / 2; + split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1 }; + + cu_loc_t new_cu_loc; + uvg_cu_loc_ctor(&new_cu_loc, x, y, half_luma, half_luma); // Split blocks and remember to change x and y block positions - uvg_encode_coding_tree(state, x, y, depth + 1, coeff, tree_type); + uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); if (!border_x || border_split_x) { - uvg_encode_coding_tree(state, x + half_cu, y, depth 
+ 1, coeff, tree_type); + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y, half_luma, half_luma); + uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); } if (!border_y || border_split_y) { - uvg_encode_coding_tree(state, x, y + half_cu, depth + 1, coeff, tree_type); + uvg_cu_loc_ctor(&new_cu_loc, x, y + half_cu, half_luma, half_luma); + uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); } if (!border || (border_split_x && border_split_y)) { - uvg_encode_coding_tree(state, x + half_cu, y + half_cu, depth + 1, coeff, tree_type); + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y + half_cu, half_luma, half_luma); + uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); } return; } } - - //ToDo: check if we can actually split - //ToDo: Implement MT split - if (depth < MAX_PU_DEPTH) - { - // cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - // CABAC_BIN(cabac, 0, "split_transform_flag"); - } - + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1); if (ctrl->cfg.lossless) { @@ -1519,8 +1518,8 @@ void uvg_encode_coding_tree( cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); - uvg_hmvp_add_mv(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); + DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_height, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); int16_t num_cand = state->encoder_control->cfg.max_merge; if (num_cand > 1) { for (int ui = 0; ui < num_cand - 1; ui++) { @@ -1555,7 +1554,7 @@ void uvg_encode_coding_tree( CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4 && cu_height != 4) { int8_t 
ctx_predmode = 0; @@ -1629,11 +1628,11 @@ void uvg_encode_coding_tree( bool non_zero_mvd = false; // TODO: height for non-square blocks - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc.x, cu_loc.y); + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); - non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, depth, NULL, NULL, &cu_loc); - DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); - uvg_hmvp_add_mv(state, x, y, width, height, cur_pu); + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, NULL, NULL, cu_loc); + DBG_PRINT_MV(state, x, y, cu_width, cu_height, cur_pu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_pu); // imv mode, select between fullpel, half-pel and quarter-pel resolutions @@ -1662,7 +1661,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (cbf) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } encode_mts_idx(state, cabac, cur_cu); @@ -1670,7 +1669,7 @@ void uvg_encode_coding_tree( } } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, NULL, NULL); } // Code chroma prediction mode. 
@@ -1694,7 +1693,7 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); can_skip_last_cbf &= luma_cbf_ctx == 2; } } @@ -1714,7 +1713,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, &cu_loc); + encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } @@ -1843,7 +1842,7 @@ double uvg_mock_encode_coding_unit( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; - const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, depth, lcu, &bits, cu_loc); + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, lcu, &bits, cu_loc); if (ctrl->cfg.amvr && non_zero_mvd) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); if (imv_mode > UVG_IMV_OFF) { @@ -1856,7 +1855,7 @@ double uvg_mock_encode_coding_unit( } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, 
state->encoder_control->cfg.cclm, &bits); diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 231e22ff..6c0c2cd1 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -54,11 +54,10 @@ bool uvg_is_lfnst_allowed( void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x_ctb, - uint16_t y_ctb, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const split_tree_t split_tree); void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, @@ -87,15 +86,17 @@ int uvg_encode_inter_prediction_unit( encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, - int depth, lcu_t* lcu, double* bits_out, const cu_loc_t* const cu_loc); -void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out); + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out); bool uvg_write_split_flag( diff --git a/src/encoderstate.c b/src/encoderstate.c index cdadccf4..dc3416e3 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -870,10 +870,15 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) enum uvg_tree_type tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? 
UVG_LUMA_T : UVG_BOTH_T; //Encode coding tree - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff, tree_type); + cu_loc_t start; + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0 }; + + uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, split_tree); if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, 0, lcu->coeff, UVG_CHROMA_T); + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, LCU_WIDTH, LCU_WIDTH); + uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, split_tree); } if (!state->cabac.only_count) { diff --git a/src/intra.c b/src/intra.c index 75f0c3a4..764ac072 100644 --- a/src/intra.c +++ b/src/intra.c @@ -585,12 +585,18 @@ static void predict_cclm( } -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a) { +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a) { assert(!(lcu && cu_a)); - if (width > 2 * height || height > 2 * width) { + if (cu_loc->width > 2 * cu_loc->height || cu_loc->height > 2 * cu_loc->width) { return 3; } - + + const int x = cu_loc->x; + const int y = cu_loc->y; + int context = 0; const cu_info_t* left = NULL; const cu_info_t* top = NULL; @@ -1761,26 +1767,26 @@ static void intra_recon_tb_leaf( */ void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, bool recon_luma, bool recon_chroma) { - const vector2d_t lcu_px = { SUB_SCU(x) >> (tree_type == UVG_CHROMA_T), SUB_SCU(y) >> (tree_type == UVG_CHROMA_T) }; - const int8_t width = LCU_WIDTH >> depth; - const 
int8_t height = width; // TODO: height for non-square blocks. + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + const vector2d_t lcu_px = { cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T) }; + const int8_t width = cu_loc->width; + const int8_t height = cu_loc->height; // TODO: height for non-square blocks. if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } + cu_loc_t chroma_cu_loc; if(!recon_luma && recon_chroma) { - x &= ~7; - y &= ~7; + uvg_cu_loc_ctor(&chroma_cu_loc, cu_loc->x & ~7, cu_loc->y & ~7, width, height); + cu_loc = &chroma_cu_loc; } // Reset CBFs because CBFs might have been set @@ -1793,22 +1799,25 @@ void uvg_intra_recon_cu( cbf_clear(&cur_cu->cbf, depth, COLOR_V); } - if (depth == 0 || cur_cu->tr_depth > depth) { + if (width > TR_MAX_WIDTH || height > TR_MAX_WIDTH) { + cu_loc_t split_cu_loc; - const int offset = width / 2; - const int32_t x2 = x + offset; - const int32_t y2 = y + offset; - - uvg_intra_recon_cu(state, x, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); + const int half_width = width / 2; + const int half_height = height / 2; + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + 
uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), lcu_px.y >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, + LCU_GET_CU_AT_PX(lcu, (lcu_px.x + half_width) >> (tree_type == UVG_CHROMA_T), lcu_px.y >> (tree_type == UVG_CHROMA_T))->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x >> (tree_type == UVG_CHROMA_T), (lcu_px.y + half_height) >> (tree_type == UVG_CHROMA_T))->cbf, + LCU_GET_CU_AT_PX(lcu, (lcu_px.x + half_width) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + half_height) >> (tree_type == UVG_CHROMA_T))->cbf, }; if (recon_luma && depth <= MAX_DEPTH) { @@ -1826,8 +1835,6 @@ void uvg_intra_recon_cu( // Small blocks are split only twice. int split_type = search_data->pred_cu.intra.isp_mode; int split_limit = uvg_get_isp_split_num(width, height, split_type, true); - cu_loc_t origin_cu; - uvg_cu_loc_ctor(&origin_cu, x, y, width, height); for (int i = 0; i < split_limit; ++i) { cu_loc_t tu_loc; @@ -1845,24 +1852,21 @@ void uvg_intra_recon_cu( } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; - const bool has_chroma = recon_chroma && (x % 8 == 0 && y % 8 == 0); - - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x, y, width, height); - + const bool has_chroma = recon_chroma && (cu_loc->x % 8 == 0 && cu_loc->y % 8 == 0); + // Process a leaf TU. 
if (has_luma) { - intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_Y, search_data, tree_type); } if (has_chroma) { - intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, &loc, &loc, lcu, COLOR_V, search_data, tree_type); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_U, search_data, tree_type); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_V, search_data, tree_type); } // TODO: not necessary to call if only luma and ISP is on uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, - &loc, depth, cur_cu, lcu, + cu_loc, depth, cur_cu, lcu, false, tree_type); } diff --git a/src/intra.h b/src/intra.h index c4bdc87e..deeb173d 100644 --- a/src/intra.h +++ b/src/intra.h @@ -142,10 +142,8 @@ void uvg_intra_predict( void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, @@ -161,7 +159,10 @@ const cu_info_t* uvg_get_co_located_luma_cu( const cu_array_t* const cu_array, enum uvg_tree_type tree_type); -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a); // ISP related defines #define NUM_ISP_MODES 3 diff --git a/src/search.c b/src/search.c index d61be039..1d992077 100644 --- a/src/search.c +++ b/src/search.c @@ -761,16 +761,17 @@ static double cu_rd_cost_tr_split_accurate( // Return estimate of bits used to code prediction mode of cur_cu. 
-static double calc_mode_bits(const encoder_state_t *state, - const lcu_t *lcu, - const cu_info_t * cur_cu, - int x, int y, int depth) +static double calc_mode_bits( + const encoder_state_t *state, + const lcu_t *lcu, + const cu_info_t * cur_cu, + const cu_loc_t* const cu_loc) { assert(cur_cu->type == CU_INTRA); - double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu); + double mode_bits = uvg_luma_mode_bits(state, cur_cu, cu_loc, lcu); - if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { + if (((cu_loc->width == 4 && cu_loc->x % 8 && cu_loc->y % 8) || (cu_loc->width != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); } @@ -945,6 +946,7 @@ static double search_cu( cur_cu->lfnst_last_scan_pos = 0; cur_cu->lfnst_idx = 0; cur_cu->joint_cb_cr = 0; + cur_cu->split_tree = split_tree.split_tree; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
@@ -1001,9 +1003,7 @@ static double search_cu( intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { intra_search.pred_cu.joint_cb_cr = 4; - uvg_search_cu_intra(state, x, y, depth, &intra_search, - lcu, - tree_type); + uvg_search_cu_intra(state, &intra_search, lcu, tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting @@ -1017,10 +1017,11 @@ static double search_cu( #endif if (state->encoder_control->cfg.cclm && tree_type != UVG_CHROMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, tree_type, true, false); + &intra_search, cu_loc, + &intra_search.pred_cu, lcu, + tree_type, + true, + false); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] @@ -1058,14 +1059,13 @@ static double search_cu( else { intra_search.pred_cu.intra.mode_chroma = 0; } - - if(tree_type != UVG_CHROMA_T && ctrl->cfg.rdo >= 2) { - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, - tree_type, false, true); + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + &intra_search.pred_cu, lcu, + tree_type, + false, + true); + if(tree_type != UVG_CHROMA_T) { intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu); } else { @@ -1128,20 +1128,20 @@ static double search_cu( } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, tree_type,recon_luma,recon_chroma); + &intra_search, cu_loc, + NULL, lcu, + tree_type, + recon_luma, recon_chroma); if(split_tree.current_depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, - 
tree_type,false,true); + &intra_search, cu_loc, + NULL, lcu, + tree_type, + false, + true); } if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; @@ -1334,7 +1334,7 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << split_tree.current_depth, split_tree.current_depth + 1}; + const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << (split_tree.current_depth * 3), split_tree.current_depth + 1}; cu_loc_t new_cu_loc; if (split_cost < cost) { uvg_cu_loc_ctor(&new_cu_loc, x, y, half_cu, half_cu); @@ -1399,14 +1399,14 @@ static double search_cu( proxy.pred_cu = *cur_cu; uvg_intra_recon_cu(state, - x, y, - depth, - &proxy, + &proxy, cu_loc, NULL, lcu, - tree_type, true, state->encoder_control->chroma_format == UVG_CSP_400); + tree_type, + true, + state->encoder_control->chroma_format == UVG_CSP_400); - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; + double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits; cost += mode_bits * state->lambda; cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0); diff --git a/src/search_intra.c b/src/search_intra.c index 67424bbf..10c6657d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -265,23 +265,21 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, */ static double search_intra_trdepth( encoder_state_t * const state, - int x_px, - int y_px, - int depth, + const cu_loc_t* const cu_loc, int max_depth, double cost_treshold, intra_search_data_t *const search_data, lcu_t *const lcu, enum uvg_tree_type tree_type) { - assert(depth >= 0 && depth <= MAX_PU_DEPTH); - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: 
height for non-square blocks - const int width_c = width > TR_MIN_WIDTH ? width / 2 : width; - - const int offset = width / 2; - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; // TODO: height for non-square blocks + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; + + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; const bool reconstruct_chroma = false;// (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400; cu_info_t* pred_cu = &search_data->pred_cu; @@ -297,7 +295,7 @@ static double search_intra_trdepth( double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; - if (depth > 0) { + if (width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH) { tr_cu->tr_depth = depth; pred_cu->tr_depth = depth; @@ -389,15 +387,14 @@ static double search_intra_trdepth( uvg_intra_recon_cu( state, - x_px, - y_px, - depth, search_data, + cu_loc, pred_cu, lcu, UVG_LUMA_T, true, - false); + false + ); if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; @@ -418,7 +415,6 @@ static double search_intra_trdepth( if (trafo != MTS_SKIP && end_idx != 0) { uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, lcu->coeff.y, width, @@ -496,15 +492,14 @@ static double search_intra_trdepth( // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu( state, - x_px, - y_px, - depth, search_data, + cu_loc, pred_cu, lcu, UVG_BOTH_T, false, - true); + true + ); best_rd_cost += uvg_cu_rd_cost_chroma( state, lcu_px.x, @@ -521,11 +516,10 @@ static double search_intra_trdepth( pred_cu->lfnst_last_scan_pos}; uvg_derive_lfnst_constraints( pred_cu, - 
depth, constraints, lcu->coeff.u, width_c, - width_c, + height_c, &lcu_px, COLOR_U); if (constraints[0] || !constraints[1]) { @@ -534,11 +528,10 @@ static double search_intra_trdepth( } uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, lcu->coeff.u, width_c, - width_c, + height_c, &lcu_px, COLOR_U); if (constraints[0] || !constraints[1]) { @@ -554,11 +547,11 @@ static double search_intra_trdepth( pred_cu->intra.mode_chroma = chroma_mode; pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, - x_px, y_px, - depth, search_data, - pred_cu, - lcu, - UVG_BOTH_T,false,true); + search_data, cu_loc, + pred_cu, lcu, + UVG_BOTH_T, + false, + true); best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); pred_cu->intra.mode = luma_mode; } @@ -610,17 +603,25 @@ static double search_intra_trdepth( // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). 
if (depth < max_depth && depth < MAX_PU_DEPTH) { + cu_loc_t split_cu_loc; + + const int half_width = width / 2; + const int half_height = height / 2; split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); } double cbf_bits = 0.0; @@ -654,7 +655,7 @@ static double search_intra_trdepth( if (depth == 0 || split_cost < nosplit_cost) { return split_cost; } else { - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type); pred_cu->cbf = nosplit_cbf; @@ -1372,17 +1373,16 @@ static void 
get_rough_cost_for_2n_modes( */ static int8_t search_intra_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int modes_to_check, intra_search_data_t *search_data, lcu_t *lcu, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { + const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false : true; // Cannot use ISP with MIP @@ -1399,12 +1399,12 @@ static int8_t search_intra_rdo( search_data[mode].pred_cu.intra.isp_mode = isp_mode; - double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); + double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, cu_loc, lcu); search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; search_data[mode].bits = rdo_bitcost; search_data[mode].cost = rdo_bitcost * state->lambda; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); + double mode_cost = search_intra_trdepth(state, cu_loc, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); best_mts_mode_for_isp[isp_mode] = search_data[mode].pred_cu.tr_idx; best_lfnst_mode_for_isp[isp_mode] = search_data[mode].pred_cu.lfnst_idx; search_data[mode].cost += mode_cost; @@ -1440,7 +1440,9 @@ static int8_t search_intra_rdo( } -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, 
+ const lcu_t* lcu) { cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; @@ -1449,8 +1451,8 @@ double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const c uvg_encode_intra_luma_coding_unit( state, &cabac_copy, cur_cu, - x, y, depth, lcu, &mode_bits - ); + cu_loc, lcu, &mode_bits + ); return mode_bits; } @@ -1651,11 +1653,11 @@ int8_t uvg_search_intra_chroma_rdo( state->search_cabac.update = 1; chroma_data[mode_i].cost = mode_bits * state->lambda; uvg_intra_recon_cu(state, - x_px, y_px, - depth, &chroma_data[mode_i], - pred_cu, - lcu, - tree_type, false, true); + &chroma_data[mode_i], &loc, + pred_cu, lcu, + tree_type, + false, + true); chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); } @@ -1829,19 +1831,15 @@ static int select_candidates_for_further_search(const encoder_state_t * const st */ void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, intra_search_data_t* mode_out, lcu_t *lcu, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; - const int8_t cu_width = LCU_WIDTH >> depth; - const cu_loc_t cu_loc = { x_px, y_px, cu_width, cu_width, - MAX(cu_width >> 1, TR_MIN_WIDTH), MAX(cu_width >> 1, TR_MIN_WIDTH) }; - const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth; - const vector2d_t luma_px = { x_px, y_px }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; + const int8_t log2_width = uvg_g_convert_to_log2[cu_loc->width]; + const int8_t log2_height = uvg_g_convert_to_log2[cu_loc->width]; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); @@ -1857,25 +1855,22 @@ void 
uvg_search_cu_intra( // Select left and top CUs if they are available. // Top CU is not available across LCU boundary. - if (x_px >= SCU_WIDTH) { - left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_width-1); + if (cu_loc->x >= SCU_WIDTH) { + left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_loc->height-1); } - if (y_px >= SCU_WIDTH && lcu_px.y > 0) { - above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_width-1, lcu_px.y - 1); + if (cu_loc->y >= SCU_WIDTH && lcu_px.y > 0) { + above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_loc->width-1, lcu_px.y - 1); } - int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); + int8_t num_cand = uvg_intra_get_dir_luma_predictor(cu_loc->x, cu_loc->y, candidate_modes, cur_cu, left_cu, above_cu); - if (depth > 0) { - uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); + bool is_large = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; + if (!is_large) { + uvg_intra_build_reference(cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); } - - // The maximum number of possible MIP modes depend on block size & shape - int width = LCU_WIDTH >> depth; - int height = width; // TODO: proper height for non-square blocks. - + // This is needed for bit cost calculation and requires too many parameters to be // calculated inside the rough search functions - uint8_t mip_ctx = uvg_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL); + uint8_t mip_ctx = uvg_get_mip_flag_context(cu_loc, lcu, NULL); // Find best intra mode for 2Nx2N. uvg_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; @@ -1886,15 +1881,15 @@ void uvg_search_cu_intra( temp_pred_cu.type = CU_INTRA; FILL(temp_pred_cu.intra, 0); // Find modes with multiple reference lines if in use. Do not use if CU in first row. 
- uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; + uint8_t lines = state->encoder_control->cfg.mrl && lcu_px.y != 0 ? MAX_REF_LINE_IDX : 1; uint8_t number_of_modes; uint8_t num_regular_modes; - bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); + bool skip_rough_search = (is_large || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { num_regular_modes = number_of_modes = search_intra_rough( state, - &cu_loc, + cu_loc, ref_pixels, LCU_WIDTH, refs, @@ -1903,7 +1898,7 @@ void uvg_search_cu_intra( search_data, &temp_pred_cu, mip_ctx); - // if(lines == 1) sort_modes(search_data, number_of_modes); + // if(lines == 1) sort_modes(search_data, number_of_modes); } else { for (int8_t i = 0; i < UVG_NUM_INTRA_MODES; i++) { @@ -1925,7 +1920,7 @@ void uvg_search_cu_intra( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; + int height = (cu_loc->height) * 2 + MAX_REF_LINE_IDX; height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. 
height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], @@ -1934,7 +1929,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(&cu_loc, &cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); + uvg_intra_build_reference(cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; @@ -1946,7 +1941,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search && lines != 1) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes, mip_ctx); @@ -1959,11 +1954,11 @@ void uvg_search_cu_intra( int num_mip_modes = 0; if (state->encoder_control->cfg.mip) { // MIP is not allowed for 64 x 4 or 4 x 64 blocks - if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { - num_mip_modes = NUM_MIP_MODES_FULL(width, height); + if (!((cu_loc->height == 64 && cu_loc->width== 4) || (cu_loc->height== 4 && cu_loc->width == 64))) { + num_mip_modes = NUM_MIP_MODES_FULL(cu_loc->width, cu_loc->height); for (int transpose = 0; transpose < 2; transpose++) { - const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + const int half_mip_modes = num_mip_modes / 2; for (int i = 0; i < half_mip_modes; ++i) { const int index = i + number_of_modes + transpose * half_mip_modes; search_data[index].pred_cu = temp_pred_cu; @@ -1975,7 +1970,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes, mip_ctx); @@ -1986,7 
+1981,10 @@ void uvg_search_cu_intra( // Set transform depth to current depth, meaning no transform splits. - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); + { + const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type); + } // Refine results with slower search or get some results if rough search was skipped. const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { @@ -2003,7 +2001,7 @@ void uvg_search_cu_intra( {2, 3, 3, 3, 3, 2}, // 64x4, 64x8, 64x16, 64x32, 64x64, 64x128, {2, 2, 2, 2, 2, 3}, // 128x4, 128x8, 128x16, 128x32, 128x64, 128x128, }; - number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[7- depth - 3][7 - depth - 3]; + number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[log2_width - 2][log2_height - 2]; } else { // Check only the predicted modes. number_of_modes_to_search = 0; @@ -2015,8 +2013,8 @@ void uvg_search_cu_intra( search_data, num_regular_modes, num_mip_modes, - width, - height + cu_loc->width, + cu_loc->height ); } } @@ -2041,13 +2039,11 @@ void uvg_search_cu_intra( search_intra_rdo( state, - x_px, - y_px, - depth, number_of_modes_to_search, search_data, lcu, - tree_type); + tree_type, + cu_loc); search_data[0].pred_cu.mts_last_scan_pos = false; search_data[0].pred_cu.violates_mts_coeff_constraint = false; } diff --git a/src/search_intra.h b/src/search_intra.h index 36470e63..e9264275 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -43,7 +43,9 @@ #include "global.h" // IWYU pragma: keep #include "intra.h" -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu); +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, + const lcu_t* lcu); double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode); @@ 
-59,11 +61,9 @@ int8_t uvg_search_cu_intra_chroma( void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, intra_search_data_t* search_data, lcu_t *lcu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc); #endif // SEARCH_INTRA_H_ diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 72cd1fb1..c352b395 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2641,8 +2641,8 @@ static void mts_idct_generic( if (tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { - skip_width == width - 4; - skip_height == height - 4; + skip_width = width - 4; + skip_height = height - 4; } else if ((width >= 8 && height >= 8)) { skip_width = width - 8; diff --git a/src/transform.c b/src/transform.c index a497003b..84eb3558 100644 --- a/src/transform.c +++ b/src/transform.c @@ -174,7 +174,6 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con */ void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int width, @@ -182,7 +181,7 @@ void uvg_derive_lfnst_constraints( const vector2d_t * const lcu_px, color_t color) { - coeff_scan_order_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + coeff_scan_order_t scan_idx = SCAN_DIAG; // ToDo: large block support in VVC? 
const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; @@ -584,9 +583,9 @@ void uvg_chroma_transform_search( if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { bool constraints[2] = { false, false }; - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, u_quant_coeff, width, height, NULL, COLOR_U); + uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U); if(!IS_JCCR_MODE(transforms[i])) { - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, v_quant_coeff, width, height, NULL, COLOR_V); + uvg_derive_lfnst_constraints(pred_cu, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; } diff --git a/src/transform.h b/src/transform.h index a7427ea0..4d5e8ba8 100644 --- a/src/transform.h +++ b/src/transform.h @@ -74,7 +74,6 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int width, From 6a0864839c513649787857b4715f823263b98d8a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 8 Sep 2022 15:10:54 +0300 Subject: [PATCH 092/254] [mtt] Actually remove the last width dependency to depth --- src/encode_coding_tree.c | 83 ++-- src/encode_coding_tree.h | 7 +- src/encoderstate.c | 45 ++- src/encoderstate.h | 9 +- src/filter.c | 3 +- src/global.h | 1 - src/intra.c | 2 +- src/rdo.c | 5 +- src/rdo.h | 3 +- src/search.c | 353 ++++++++++-------- src/search.h | 28 +- src/search_inter.c | 18 +- src/search_intra.c | 95 ++--- src/search_intra.h | 4 +- src/strategies/avx2/quant-avx2.c | 2 +- .../generic/encode_coding_tree-generic.c | 6 +- .../generic/encode_coding_tree-generic.h | 2 +- src/strategies/generic/quant-generic.c | 6 +- src/strategies/strategies-encode.h | 2 +- src/transform.c | 27 +- src/transform.h 
| 2 - tests/test_cabac_state.sh | 4 +- 22 files changed, 360 insertions(+), 347 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index fcb6d308..8c19df4b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -47,12 +47,13 @@ #include "tables.h" #include "videoframe.h" -bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu) +bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu, const cu_loc_t* + const cu_loc) { uint32_t ts_max_size = 1 << state->encoder_control->cfg.trskip_max_size; const uint32_t max_size = 32; // CU::isIntra(cu) ? MTS_INTRA_MAX_CU_SIZE : MTS_INTER_MAX_CU_SIZE; - const uint32_t cu_width = LCU_WIDTH >> pred_cu->depth; - const uint32_t cu_height = LCU_WIDTH >> pred_cu->depth; + const uint32_t cu_width = cu_loc->width; + const uint32_t cu_height = cu_loc->height; //bool mts_allowed = cu.chType == CHANNEL_TYPE_LUMA && compID == COMPONENT_Y; uint8_t mts_type = state->encoder_control->cfg.mts; @@ -66,14 +67,16 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr return mts_allowed; } -static void encode_mts_idx(encoder_state_t * const state, +static void encode_mts_idx( + encoder_state_t * const state, cabac_data_t * const cabac, - const cu_info_t *const pred_cu) + const cu_info_t *const pred_cu, + const cu_loc_t* const cu_loc) { //TransformUnit &tu = *cu.firstTU; int mts_idx = pred_cu->tr_idx; - if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu) && mts_idx != MTS_SKIP + if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu, cu_loc) && mts_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ) @@ -498,7 +501,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, - const cu_loc_t *cu_loc, + const cu_loc_t * const cu_loc, int depth, cu_info_t* cur_pu, int8_t* scan_idx, @@ -541,8 +544,7 @@ static 
void encode_chroma_tu( } } else { - // const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - const coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; @@ -700,7 +702,7 @@ static void encode_transform_coeff( } */ - int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH); + int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH); const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0; const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0; @@ -1290,15 +1292,13 @@ bool uvg_write_split_flag( const cu_info_t * left_cu, const cu_info_t * above_cu, uint8_t split_flag, + const cu_loc_t* const cu_loc, int depth, - int cu_width, - int x, - int y, enum uvg_tree_type tree_type, double* bits_out) { - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); + uint16_t abs_x = (cu_loc->x + state->tile->offset_x) >> (tree_type == UVG_CHROMA_T); + uint16_t abs_y = (cu_loc->y + state->tile->offset_y) >> (tree_type == UVG_CHROMA_T); double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; // Implisit split flag when on border @@ -1311,10 +1311,12 @@ bool uvg_write_split_flag( // ToDo: update this when btt is actually used bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; uint8_t implicit_split_mode = UVG_NO_SPLIT; //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_width - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T))); + bool bottom_left_available = ((abs_y + cu_height - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T))); bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T))); if (!bottom_left_available && !top_right_available && allow_qt) { @@ -1349,11 +1351,11 @@ bool uvg_write_split_flag( if (no_split && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number // ToDo: should use height and width to increase model, PU_GET_W() ? - if (left_cu && LCU_WIDTH >> left_cu->depth < LCU_WIDTH >> depth) { + if (left_cu && left_cu->depth > depth) { split_model++; } - if (above_cu && LCU_WIDTH >> above_cu->depth < LCU_WIDTH >> depth) { + if (above_cu && above_cu->depth > depth) { split_model++; } @@ -1457,7 +1459,16 @@ void uvg_encode_coding_tree( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) { - const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, depth, cu_width, x, y, tree_type,NULL); + const int split_flag = uvg_write_split_flag( + state, + cabac, + left_cu, + above_cu, + (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, + cu_loc, + depth, + tree_type, + NULL); if (split_flag || border) { const int half_luma = cu_loc->width / 2; @@ -1597,8 +1608,8 @@ void uvg_encode_coding_tree( uvg_pixel *rec_base_v = &frame->rec->v[x / 2 + y / 2 * ctrl->in.width / 2]; // Luma - for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) { + for (unsigned y_px = 0; y_px < cu_height; y_px++) { + for (unsigned x_px = 0; 
x_px < cu_width; x_px++) { uvg_bitstream_put(cabac->stream, base_y[x_px + y_px * ctrl->in.width], 8); rec_base_y[x_px + y_px * ctrl->in.width] = base_y[x_px + y_px * ctrl->in.width]; } @@ -1606,14 +1617,14 @@ void uvg_encode_coding_tree( // Chroma if (ctrl->chroma_format != UVG_CSP_400) { - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_u[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_u[x_px + y_px * (ctrl->in.width >> 1)] = base_u[x_px + y_px * (ctrl->in.width >> 1)]; } } - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_v[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_v[x_px + y_px * (ctrl->in.width >> 1)] = base_v[x_px + y_px * (ctrl->in.width >> 1)]; } @@ -1664,7 +1675,7 @@ void uvg_encode_coding_tree( encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } - encode_mts_idx(state, cabac, cur_cu); + encode_mts_idx(state, cabac, cur_cu, cu_loc); } } else if (cur_cu->type == CU_INTRA) { @@ -1701,7 +1712,7 @@ void uvg_encode_coding_tree( if (tree_type != UVG_CHROMA_T) { bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y); } - encode_mts_idx(state, cabac, cur_cu); + encode_mts_idx(state, cabac, cur_cu, cu_loc); // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != UVG_CSP_400 && @@ -1731,7 +1742,7 @@ void uvg_encode_coding_tree( end: - if (is_last_cu_in_qg(state, x, y, depth)) { + if 
(is_last_cu_in_qg(state, cu_loc)) { state->last_qp = cur_cu->qp; } @@ -1752,11 +1763,9 @@ double uvg_mock_encode_coding_unit( const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); - - const int cu_width = LCU_WIDTH >> depth; - + int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); + int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); + const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { if(x_local || tree_type != UVG_CHROMA_T) { @@ -1787,16 +1796,14 @@ double uvg_mock_encode_coding_unit( left_cu, above_cu, 0, + cu_loc, depth, - cu_width >> (tree_type == UVG_CHROMA_T), - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), tree_type, &bits); } // Encode skip flag - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_skip = 0; if (left_cu && left_cu->skipped) { @@ -1829,7 +1836,7 @@ double uvg_mock_encode_coding_unit( } } // Prediction mode - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_predmode = 0; diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 6c0c2cd1..e75ad46a 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -40,7 +40,8 @@ #include "encoderstate.h" #include "global.h" -bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu); +bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t* + const cu_loc); bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, @@ -105,10 +106,8 @@ bool uvg_write_split_flag( const cu_info_t* left_cu, const cu_info_t* above_cu, uint8_t split_flag, + const cu_loc_t* const 
cu_loc, int depth, - int cu_width, - int x, - int y, enum uvg_tree_type tree_type, double* bits_out); diff --git a/src/encoderstate.c b/src/encoderstate.c index dc3416e3..593ffd16 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -627,36 +627,45 @@ static void encode_sao(encoder_state_t * const state, * \param prev_qp -1 if QP delta has not been coded in current QG, * otherwise the QP of the current QG */ -static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp) +static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const + int depth) { // Stop recursion if the CU is completely outside the frame. - if (x >= state->tile->frame->width || y >= state->tile->frame->height) return; + if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return; - cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y); - const int cu_width = LCU_WIDTH >> depth; + cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y); + const int width = LCU_WIDTH >> cu->depth; if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } - if (cu->depth > depth) { + if (cu_loc->width > width) { // Recursively process sub-CUs. 
- const int d = cu_width >> 1; - set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp); + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); } else { bool cbf_found = *prev_qp >= 0; + int y_limit = cu_loc->y + cu_loc->height; + int x_limit = cu_loc->x + cu_loc->width; if (cu->tr_depth > depth) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. 
const int tu_width = LCU_WIDTH >> cu->tr_depth; - for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) { - for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) { + for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) { + for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) { cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); if (cbf_is_set_any(tu->cbf, cu->depth)) { cbf_found = true; @@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las if (cbf_found) { *prev_qp = qp = cu->qp; } else { - qp = uvg_get_cu_ref_qp(state, x, y, *last_qp); + qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp); } // Set the correct QP for all state->tile->frame->cu_array elements in // the area covered by the CU. - for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) { - for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) { + for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) { + for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) { uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp; } } - if (is_last_cu_in_qg(state, x, y, depth)) { + if (is_last_cu_in_qg(state, cu_loc)) { *last_qp = cu->qp; } } @@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; - set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH); + set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) { diff --git a/src/encoderstate.h b/src/encoderstate.h index 55d265e3..6cad3e36 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -401,14 +401,13 @@ 
static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) * \param depth depth in the CU tree * \return true, if it's the last CU in its QG, otherwise false */ -static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) +static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc) { if (state->frame->max_qp_delta_depth < 0) return false; - - const int cu_width = LCU_WIDTH >> depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; - const int right = x + cu_width; - const int bottom = y + cu_width; + const int right = cu_loc->x + cu_loc->width; + const int bottom = cu_loc->y + cu_loc->height; return (right % qg_width == 0 || right >= state->tile->frame->width) && (bottom % qg_width == 0 || bottom >= state->tile->frame->height); } diff --git a/src/filter.c b/src/filter.c index 26a57100..b366dd4e 100644 --- a/src/filter.c +++ b/src/filter.c @@ -856,8 +856,7 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, uint8_t max_filter_length_Q = 0; const int cu_size = LCU_WIDTH >> cu_q->depth; // TODO: NON square - const int pu_size = dir == EDGE_HOR ? cu_size - : cu_size; + const int pu_size = dir == EDGE_HOR ? cu_size : cu_size; const int pu_pos = dir == EDGE_HOR ? 
y_coord : x_coord; get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, diff --git a/src/global.h b/src/global.h index 65ca2fa9..e4a11b20 100644 --- a/src/global.h +++ b/src/global.h @@ -273,7 +273,6 @@ typedef int32_t mv_t; #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value)) #define CLIP_TO_QP(value) CLIP(0, 51, (value)) #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; } -#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth) #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val)) #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) diff --git a/src/intra.c b/src/intra.c index 764ac072..8f87104f 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1555,7 +1555,7 @@ void uvg_intra_predict( uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, + state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 
0 : 1], tree_type); } diff --git a/src/rdo.c b/src/rdo.c index f7eb2a9e..262b4f83 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -297,7 +297,7 @@ out: static INLINE double get_coeff_cabac_cost( const encoder_state_t * const state, const coeff_t *coeff, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, @@ -415,7 +415,7 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, @@ -1409,7 +1409,6 @@ void uvg_rdoq( int8_t color, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, uint8_t lfnst_idx) { diff --git a/src/rdo.h b/src/rdo.h index eb9714f6..2b557651 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -60,7 +60,6 @@ void uvg_rdoq( int8_t type, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, uint8_t lfnst_idx); @@ -73,7 +72,7 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, diff --git a/src/search.c b/src/search.c index 1d992077..11d934b9 100644 --- a/src/search.c +++ b/src/search.c @@ -63,30 +63,39 @@ static const int INTRA_THRESHOLD = 8; -static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) +static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type + tree_type) { - for (int y = y_local; y < y_local + width; y += SCU_WIDTH) { - for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { + const int y_limit = (cu_loc->local_y + cu_loc->height) >> (tree_type == UVG_CHROMA_T); + const int x_limit = (cu_loc->local_x + cu_loc->width) >> (tree_type == UVG_CHROMA_T); + for (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x >> 
(tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); } } } -static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, enum uvg_tree_type - tree_type) +static INLINE void copy_cu_pixels( + lcu_t *from, + lcu_t *to, + const cu_loc_t* const cu_loc, + enum uvg_tree_type + tree_type) { + const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); + const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); const int luma_index = x_local + y_local * LCU_WIDTH; const int chroma_index = tree_type == UVG_CHROMA_T ? x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; if(tree_type != UVG_CHROMA_T) { uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index], - width, width, LCU_WIDTH, LCU_WIDTH); + cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { uvg_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -103,8 +112,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); - const int chroma_x = cu_loc->x >> (tree_type != UVG_CHROMA_T); - const int chroma_y = cu_loc->y >> (tree_type != UVG_CHROMA_T); + const int chroma_x = (cu_loc->x >> 1) & ~3; + const int chroma_y = (cu_loc->y >> 1) & ~3; const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C); 
copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); @@ -118,15 +127,17 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to /** * Copy all non-reference CU data from next level to current level. */ -static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint, enum - uvg_tree_type tree_type) +static void work_tree_copy_up( + lcu_t *work_tree, + bool joint, + enum + uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const int depth) { - const int width = LCU_WIDTH >> depth; - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x_local, y_local, width, width); - copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]); - copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type); - copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); + copy_cu_info (&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type); + copy_cu_pixels(&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type); + copy_cu_coeffs(cu_loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); } @@ -134,24 +145,32 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t /** * Copy all non-reference CU data from current level to all lower levels. */ -static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree, enum uvg_tree_type - tree_type) +static void work_tree_copy_down( + int depth, + lcu_t *work_tree, + enum uvg_tree_type + tree_type, + const cu_loc_t* const cu_loc) { - const int width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH >> depth : LCU_WIDTH_C >> 1; for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) { - copy_cu_info (x_local, y_local, width, &work_tree[depth], &work_tree[i]); - copy_cu_pixels(x_local, y_local, LCU_WIDTH >> depth, &work_tree[depth], &work_tree[i], tree_type); + copy_cu_info (&work_tree[depth], &work_tree[i], cu_loc, tree_type); + copy_cu_pixels(&work_tree[depth], &work_tree[i], cu_loc, tree_type); } } -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type) +void uvg_lcu_fill_trdepth( + lcu_t *lcu, + const cu_loc_t* const cu_loc, + uint8_t tr_depth, + enum uvg_tree_type + tree_type) { - const int x_local = SUB_SCU(x_px); - const int y_local = SUB_SCU(y_px); - const unsigned width = (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C) >> depth; + const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); + const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); + const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const unsigned height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; - for (unsigned y = 0; y < width; y += SCU_WIDTH) { + for (unsigned y = 0; y < height; y += SCU_WIDTH) { for (unsigned x = 0; x < width; x += SCU_WIDTH) { LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth; } @@ -167,6 +186,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in to->type = cu->type; to->depth = cu->depth; to->qp = cu->qp; + to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos; @@ -214,34 +234,37 @@ static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned wid //Calculates cost for all zero coeffs -static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, +static double cu_zero_coeff_cost( + const encoder_state_t *state, + lcu_t *work_tree, + const cu_loc_t* const cu_loc, const int depth) { - int x_local = SUB_SCU(x); - int y_local = SUB_SCU(y); - int cu_width = LCU_WIDTH >> depth; lcu_t *const lcu = &work_tree[depth]; + const int y_local = cu_loc->local_y; + const int x_local = cu_loc->local_x; + const int luma_index = y_local * LCU_WIDTH + x_local; const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], - LCU_WIDTH, LCU_WIDTH, cu_width + LCU_WIDTH, LCU_WIDTH, cu_loc->width ); - if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { + if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width ); ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, 
cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width ); } // Save the pixels at a lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1], UVG_BOTH_T); + copy_cu_pixels(lcu, &work_tree[depth + 1], cu_loc, UVG_BOTH_T); return ssd; } @@ -295,46 +318,45 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - uint8_t isp_cbf) +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf) { - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); - + // cur_cu is used for TU parameters. 
- cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; double tr_tree_bits = 0; - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); + // Check that lcu is not in - const uint8_t tr_depth = tr_cu->tr_depth - depth; - - if (tr_depth > 0) { - int offset = width / 2; + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; - sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, isp_cbf); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, isp_cbf); - sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y+ half_height, half_width, half_height); + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); return sum + tr_tree_bits * state->lambda; } // Add transform_tree cbf_luma bit cost. 
if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); if (pred_cu->type == CU_INTRA || @@ -347,7 +369,9 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); } - if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if (is_set && state->encoder_control->cfg.trskip_enable + && cu_loc->width <= (1 << state->encoder_control->cfg.trskip_max_size) + && cu_loc->height <= (1 << state->encoder_control->cfg.trskip_max_size)) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); } } @@ -367,28 +391,28 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, // SSD between reconstruction and original int ssd = 0; if (!state->encoder_control->cfg.lossless) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + cu_loc->width); } if (!skip_residual_coding) { - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + int8_t luma_scan_mode = SCAN_DIAG; if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } else { int split_type = pred_cu->intra.isp_mode; - int split_limit = 
uvg_get_isp_split_num(width, height, split_type, true); + int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, split_type, true); for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true); + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height, i, split_type, true); const int part_x = split_loc.x; const int part_y = split_loc.y; @@ -406,34 +430,32 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu) +double uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const cu_loc) { - const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; - const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const vector2d_t lcu_px = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; + const int width = cu_loc->chroma_width; + const int height = cu_loc->chroma_height; + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); - - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); - + double tr_tree_bits = 0; double coeff_bits = 0; - - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) { + + if (cu_loc->width == 4 && cu_loc->height == 4 && (cu_loc->x % 8 == 0 || cu_loc->y % 8 == 0)) { // For MAX_PU_DEPTH calculate chroma for previous depth for the first // block and return 0 cost for all others. 
return 0; } + + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V); + // See luma for why the second condition if (!skip_residual_coding) { const int tr_depth = depth - pred_cu->depth; @@ -450,14 +472,21 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, } - if (tr_cu->tr_depth > depth) { - int offset = LCU_WIDTH >> (depth + 1); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + // Recursively process sub-CUs. + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); return sum + tr_tree_bits * state->lambda; } @@ -487,14 +516,17 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, if (!skip_residual_coding) { int8_t scan_order = 
uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - //const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + // We need the rounded & shifted coordinates for the chroma coeff calculation + cu_loc_t chroma_loc; + uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); } else { - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); } } @@ -507,39 +539,30 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, static double cu_rd_cost_tr_split_accurate( const encoder_state_t* const state, - const int x_px, - const int y_px, - const int depth, const cu_info_t* const pred_cu, lcu_t* const lcu, enum uvg_tree_type tree_type, - uint8_t isp_cbf) { - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x_px, y_px, width, height); - + uint8_t isp_cbf, + const cu_loc_t* const cu_loc) { + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
- cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T)); double coeff_bits = 0; double tr_tree_bits = 0; - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - const uint8_t tr_depth = tr_cu->tr_depth - depth; - + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U); const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; { - int cbf = cbf_is_set_any(pred_cu->cbf, depth); + int cbf = cbf_is_set_any(tr_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { @@ -548,24 +571,30 @@ static double cu_rd_cost_tr_split_accurate( } - bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8)) && tree_type != UVG_LUMA_T; + bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T; if( !skip_residual_coding && has_chroma) { - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + if(tr_cu->tr_depth == depth) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); } - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + if(tr_cu->tr_depth == depth) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); } } - if (tr_depth > 0) { - int offset = LCU_WIDTH >> (depth + 1); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > 
TR_MAX_WIDTH) { double sum = 0; - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf); + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); return sum + tr_tree_bits * state->lambda; } const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; @@ -573,7 +602,7 @@ static double cu_rd_cost_tr_split_accurate( const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. 
if (!is_isp) { - const int is_tr_split = depth - tr_cu->depth; + const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; if ((pred_cu->type == CU_INTRA || is_tr_split || cb_flag_u || @@ -610,7 +639,7 @@ static double cu_rd_cost_tr_split_accurate( // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless && tree_type != UVG_CHROMA_T) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y; luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, width); @@ -623,12 +652,12 @@ static double cu_rd_cost_tr_split_accurate( if (can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); } - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + int8_t luma_scan_mode = SCAN_DIAG; if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); } else { int split_type = pred_cu->intra.isp_mode; @@ -636,7 +665,7 @@ static double cu_rd_cost_tr_split_accurate( for (int i = 0; i < split_limit; ++i) { cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true); + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); const int part_x = split_loc.x; const int part_y = split_loc.y; @@ -649,8 +678,8 @@ static double cu_rd_cost_tr_split_accurate( } } - if(depth == 4 || tree_type == UVG_LUMA_T) { - if (uvg_is_lfnst_allowed(state, tr_cu, width, 
width, x_px, y_px, tree_type, COLOR_Y, lcu)) { + if(cu_loc->width == 4 || tree_type == UVG_LUMA_T) { + if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, COLOR_Y, lcu)) { const int lfnst_idx = tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -672,14 +701,17 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { - const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 }; - uvg_cu_loc_ctor(&loc, lcu_px.x, lcu_px.y, width, height); - const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); - const int chroma_height = chroma_width; // TODO: height for non-square blocks - int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + cu_loc_t chroma_loc; + const vector2d_t lcu_px = { (cu_loc->local_x >> 1) & ~3, (cu_loc->local_y >> 1) &~3 }; + uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, width, height); + const int chroma_width = cu_loc->chroma_width; + const int chroma_height = cu_loc->chroma_height; // TODO: height for non-square blocks + int8_t scan_order = SCAN_DIAG; //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); - const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size) + && chroma_height <= (1 << state->encoder_control->cfg.trskip_max_size); if(pred_cu->joint_cb_cr == 0) { if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; @@ -697,8 +729,8 @@ static double cu_rd_cost_tr_split_accurate( if(chroma_can_use_tr_skip && cb_flag_v) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += 
uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { @@ -715,12 +747,12 @@ static double cu_rd_cost_tr_split_accurate( if (chroma_can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } - if (uvg_is_lfnst_allowed(state, tr_cu, width, height, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { - const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; + if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, cu_loc->width == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { + const int lfnst_idx = (cu_loc->width != 4 && tree_type != UVG_CHROMA_T) ? 
tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T], @@ -739,7 +771,7 @@ static double cu_rd_cost_tr_split_accurate( tr_cu->lfnst_last_scan_pos = false; tr_cu->violates_lfnst_constrained_luma = false; tr_cu->violates_lfnst_constrained_chroma = false; - if (uvg_is_mts_allowed(state, tr_cu) && tree_type != UVG_CHROMA_T) { + if (uvg_is_mts_allowed(state, tr_cu, cu_loc) && tree_type != UVG_CHROMA_T) { bool symbol = tr_cu->tr_idx != 0; int ctx_idx = 0; @@ -1035,10 +1067,6 @@ static double search_cu( if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; - // There is almost no benefit to doing the chroma mode search for - // rd2. Possibly because the luma mode search already takes chroma - // into account, so there is less of a chanse of luma mode being - // really bad for chroma. if(tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra; intra_mode = intra_search.pred_cu.intra.mode; @@ -1046,7 +1074,7 @@ static double search_cu( } intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) { - uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search, tree_type); + uvg_search_cu_intra_chroma(state, cu_loc, lcu, &intra_search, tree_type); if (intra_search.pred_cu.joint_cb_cr == 0) { intra_search.pred_cu.joint_cb_cr = 4; @@ -1066,7 +1094,7 @@ static double search_cu( false, true); if(tree_type != UVG_CHROMA_T) { - intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu); + intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, cu_loc); } else { intra_cost = intra_search.cost; @@ -1080,7 +1108,7 @@ static double 
search_cu( } intra_search.pred_cu.intra.mode = intra_mode; if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, x_local, y_local, split_tree.current_depth, split_tree.current_depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, split_tree.current_depth, tree_type); } } if (intra_cost < cost) { @@ -1187,14 +1215,14 @@ static double search_cu( // This will no longer be necessary if the transform depths are not shared. int tr_depth = MAX(1, split_tree.current_depth); - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, split_tree.current_depth) + inter_bitcost * state->lambda; + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; } cu_loc_t loc; @@ -1239,13 +1267,13 @@ static double search_cu( cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc); if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, &work_tree[split_tree.current_depth + 1], lcu, tree_type); + copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type); if (cur_cu->merged) { cur_cu->merged = 0; @@ -1256,7 +1284,7 @@ static double search_cu( if (cur_cu->tr_depth != 0) { // Reset transform depth since there are no coefficients. This // ensures that CBF is cleared for the whole area of the CU. 
- uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); } cur_cu->cbf = 0; @@ -1317,10 +1345,8 @@ static double search_cu( left_cu, above_cu, 1, + cu_loc, depth, - cu_width, - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), tree_type, &split_bits); } @@ -1380,8 +1406,7 @@ static double search_cu( uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, - 0, depth, cu_width, x, y, tree_type, - &bits); + 0, cu_loc, depth, tree_type, &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -1391,7 +1416,7 @@ static double search_cu( cur_cu->lfnst_idx = 0; cur_cu->cr_lfnst_idx = 0; - uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, cur_cu->tr_depth, tree_type); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); intra_search_data_t proxy; @@ -1404,12 +1429,12 @@ static double search_cu( lcu, tree_type, true, - state->encoder_control->chroma_format == UVG_CSP_400); + state->encoder_control->chroma_format != UVG_CSP_400); double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits; cost += mode_bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); @@ -1419,7 +1444,7 @@ static double search_cu( if (split_cost < cost) { // Copy split modes to this depth. 
cost = split_cost; - work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr, tree_type); + work_tree_copy_up(work_tree, state->encoder_control->cfg.jccr, tree_type, cu_loc, depth); #if UVG_DEBUG //debug_split = 1; #endif @@ -1427,7 +1452,7 @@ static double search_cu( // Copy this CU's mode all the way down for use in adjacent CUs mode // search. memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); - work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); + work_tree_copy_down(depth, work_tree, tree_type, cu_loc); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); @@ -1454,7 +1479,7 @@ static double search_cu( } else if (depth >= 0 && depth < MAX_PU_DEPTH) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. - work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); + work_tree_copy_down(depth, work_tree, tree_type, cu_loc); if(tree_type != UVG_CHROMA_T) { downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] diff --git a/src/search.h b/src/search.h index 1a013670..73c7efec 100644 --- a/src/search.h +++ b/src/search.h @@ -84,18 +84,24 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map); void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff); -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - uint8_t isp_cbf); -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu); +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf); +double 
uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const); -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type); +void uvg_lcu_fill_trdepth( + lcu_t *lcu, + const cu_loc_t* const cu_loc, + uint8_t tr_depth, + enum uvg_tree_type + tree_type); void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index 53587b84..46b04349 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1811,7 +1811,7 @@ static void search_pu_inter( cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); + uvg_lcu_fill_trdepth(lcu, cu_loc, MAX(1, depth), UVG_BOTH_T); uvg_inter_recon_cu(state, lcu, true, false, cu_loc); uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); @@ -2129,12 +2129,12 @@ void uvg_cu_cost_inter_rd2( const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; int tr_depth = MAX(1, depth); - uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, tr_depth, UVG_BOTH_T); + uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, UVG_BOTH_T); const int x_px = SUB_SCU(cu_loc->x); const int y_px = SUB_SCU(cu_loc->y); - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: non-square blocks + const int width = cu_loc->width; + const int height = cu_loc->height; cabac_data_t cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); @@ -2155,10 +2155,10 @@ void 
uvg_cu_cost_inter_rd2( int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width); double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width); ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; } double no_cbf_bits; @@ -2217,12 +2217,10 @@ void uvg_cu_cost_inter_rd2( uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &cabac_copy, cu_loc, index, - 0, cur_cu, u_pred, v_pred, @@ -2262,10 +2260,10 @@ void uvg_cu_cost_inter_rd2( int cbf = cbf_is_set_any(cur_cu->cbf, depth); if(cbf) { - *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, 0); + *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0); if (reconstruct_chroma) { if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { - *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc); } else { *inter_cost += chroma_cost; diff --git a/src/search_intra.c b/src/search_intra.c index 10c6657d..6710b6fc 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -431,9 +431,7 @@ static double search_intra_trdepth( } double rd_cost = uvg_cu_rd_cost_luma( state, - lcu_px.x, - lcu_px.y, - depth, + cu_loc, pred_cu, lcu, search_data->best_isp_cbfs); @@ -502,11 +500,9 @@ static double search_intra_trdepth( ); best_rd_cost += uvg_cu_rd_cost_chroma( state, - lcu_px.x, - lcu_px.y, - depth, pred_cu, - lcu); + lcu, + cu_loc); pred_cu->intra.mode = luma_mode; // Check lfnst constraints for chroma @@ -552,7 +548,7 @@ static double search_intra_trdepth( UVG_BOTH_T, false, true); - best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + best_rd_cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); pred_cu->intra.mode = luma_mode; } 
pred_cu->tr_skip = best_tr_idx == MTS_SKIP; @@ -655,7 +651,7 @@ static double search_intra_trdepth( if (depth == 0 || split_cost < nosplit_cost) { return split_cost; } else { - uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); pred_cu->cbf = nosplit_cbf; @@ -690,19 +686,15 @@ static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) static int search_intra_chroma_rough( encoder_state_t * const state, - int x_px, - int y_px, - int depth, - const vector2d_t* const lcu_px, intra_search_data_t* chroma_data, lcu_t* lcu, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - assert(depth != 4 || (x_px & 4 && y_px & 4)); - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); + const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width]; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; + const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 }; const int width = 1 << log2_width_c; const int height = width; // TODO: height for non-square blocks @@ -714,7 +706,7 @@ static int search_intra_chroma_rough( uvg_intra_references refs_v; uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); - vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 }; + vector2d_t lcu_cpx = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; @@ -1494,29 +1486,19 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in int8_t uvg_search_intra_chroma_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int8_t num_modes, lcu_t *const lcu, 
intra_search_data_t* chroma_data, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); - - const int luma_width = LCU_WIDTH >> depth; - const int luma_height = LCU_WIDTH >> depth; // TODO: height - - int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); + const bool reconstruct_chroma = true; - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x_px & ~7, y_px & ~7, luma_width, luma_height); - - const int chroma_width = loc.chroma_width; - const int chroma_height = loc.chroma_height; + const int chroma_width = cu_loc->chroma_width; + const int chroma_height = cu_loc->chroma_height; uvg_intra_references refs[2]; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; + const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 }; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height, @@ -1524,17 +1506,17 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); - uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); - const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C; + const int offset = ((cu_loc->local_x & ~7) >> 1) + ((cu_loc->local_y & ~7) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((depth == 4 
|| tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { + if((cu_loc->width == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } @@ -1572,7 +1554,7 @@ int8_t uvg_search_intra_chroma_rdo( uvg_intra_predict( state, &refs[COLOR_U - 1], - &loc, + cu_loc, COLOR_U, u_pred, &chroma_data[mode_i], @@ -1581,7 +1563,7 @@ int8_t uvg_search_intra_chroma_rdo( uvg_intra_predict( state, &refs[COLOR_V - 1], - &loc, + cu_loc, COLOR_V, v_pred, &chroma_data[mode_i], @@ -1606,12 +1588,10 @@ int8_t uvg_search_intra_chroma_rdo( uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &temp_cabac, - &loc, + cu_loc, offset, - mode, pred_cu, u_pred, v_pred, @@ -1653,12 +1633,12 @@ int8_t uvg_search_intra_chroma_rdo( state->search_cabac.update = 1; chroma_data[mode_i].cost = mode_bits * state->lambda; uvg_intra_recon_cu(state, - &chroma_data[mode_i], &loc, + &chroma_data[mode_i], cu_loc, pred_cu, lcu, tree_type, false, true); - chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); } } @@ -1677,14 +1657,11 @@ int8_t uvg_search_intra_chroma_rdo( int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, enum uvg_tree_type tree_type) { - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const cu_info_t *cur_pu = &search_data->pred_cu; int8_t intra_mode = !cur_pu->intra.mip_flag ? 
cur_pu->intra.mode : 0; @@ -1698,6 +1675,9 @@ int8_t uvg_search_cu_intra_chroma( } } + cu_loc_t chroma_loc; + uvg_cu_loc_ctor(&chroma_loc, cu_loc->x & ~7, cu_loc->y & ~7, cu_loc->width, cu_loc->height); + // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made // between luma mode and one other mode that looks the best @@ -1715,7 +1695,7 @@ int8_t uvg_search_cu_intra_chroma( chroma_data[i].pred_cu = *cur_pu; chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? intra_mode : modes[i]; chroma_data[i].cost = 0; - if(depth != 4 && tree_type == UVG_BOTH_T) { + if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); } } @@ -1726,16 +1706,13 @@ int8_t uvg_search_cu_intra_chroma( if(state->encoder_control->cfg.cclm && 0){ - num_modes = search_intra_chroma_rough(state, x_px, y_px, depth, - &lcu_px, - chroma_data, - lcu, - intra_mode, - tree_type); + num_modes = search_intra_chroma_rough(state, chroma_data, lcu, intra_mode, + tree_type, + &chroma_loc); } if (num_modes > 1 || state->encoder_control->cfg.jccr) { - uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode, tree_type); + uvg_search_intra_chroma_rdo(state, num_modes, lcu, chroma_data, intra_mode, tree_type, &chroma_loc); } else if(cur_pu->lfnst_idx) { chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx; @@ -1983,7 +1960,7 @@ void uvg_search_cu_intra( // Set transform depth to current depth, meaning no transform splits. { const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); } // Refine results with slower search or get some results if rough search was skipped. 
const int32_t rdo_level = state->encoder_control->cfg.rdo; diff --git a/src/search_intra.h b/src/search_intra.h index e9264275..faa26ff1 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -52,9 +52,7 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t* best_cclm, enum uvg_tree_type tree_type); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index bc70daab..2fc27872 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -709,7 +709,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, height, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); } else if (state->encoder_control->cfg.rdoq_enable && use_trskip) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 8d9ca61d..48a5cc3d 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -54,7 +54,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - const cu_loc_t *cu_loc, + const cu_loc_t * const cu_loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, @@ -80,8 +80,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS - const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; - const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + const uint8_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint8_t log2_block_height = uvg_g_convert_to_log2[height]; 
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index 26682a61..0de02e3c 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -44,7 +44,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - const cu_loc_t *loc, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index be396a8b..bfb92700 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -317,8 +317,7 @@ int uvg_quant_cbcr_residual_generic( { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - cur_cu->cr_lfnst_idx); + scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx); } else if (state->encoder_control->cfg.rdoq_enable && false) { uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? 
COLOR_V : COLOR_U, @@ -499,8 +498,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, height, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - lfnst_index); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index 625f4005..969dfb57 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -49,7 +49,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - const cu_loc_t *loc, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/transform.c b/src/transform.c index 84eb3558..86ff515b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -434,8 +434,7 @@ static void quantize_chroma( (transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, - scan_order, CU_INTRA, depth, 0, - lfnst_idx); + scan_order, CU_INTRA, 0, lfnst_idx); int j; for (j = 0; j < width * height; ++j) { @@ -449,8 +448,7 @@ static void quantize_chroma( uint16_t temp_cbf = 0; if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U); uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, depth, temp_cbf, - lfnst_idx); + scan_order, CU_INTRA, temp_cbf, lfnst_idx); } } @@ -486,12 +484,10 @@ static void quantize_chroma( void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], @@ -507,6 +503,8 @@ void uvg_chroma_transform_search( const int width = cu_loc->chroma_width; const int height = cu_loc->chroma_height; + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + uvg_transform2d( state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); @@ -553,8 +551,6 @@ void uvg_chroma_transform_search( coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; - const coeff_scan_order_t scan_order = - uvg_get_scan_order(pred_cu->type, mode, depth); bool u_has_coeffs = false; bool v_has_coeffs = false; if(pred_cu->cr_lfnst_idx) { @@ -575,13 +571,13 @@ void uvg_chroma_transform_search( i, u_quant_coeff, v_quant_coeff, - scan_order, + SCAN_DIAG, &u_has_coeffs, &v_has_coeffs, pred_cu->cr_lfnst_idx); if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; - if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { + if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (cu_loc->width == 4 || tree_type == UVG_CHROMA_T)) { bool constraints[2] = { false, false }; uvg_derive_lfnst_constraints(pred_cu, 
constraints, u_quant_coeff, width, height, NULL, COLOR_U); if(!IS_JCCR_MODE(transforms[i])) { @@ -593,9 +589,9 @@ void uvg_chroma_transform_search( if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue; if (u_has_coeffs) { - uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); @@ -606,6 +602,7 @@ void uvg_chroma_transform_search( else { uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height); } + if (transforms[i] != JCCR_1) { for (int j = 0; j < width * height; j++) { u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((uvg_pixel)(u_pred[j] + u_recon_resi[j])); @@ -620,9 +617,12 @@ void uvg_chroma_transform_search( else { uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width); } + + if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) { uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); @@ -633,6 +633,7 @@ void uvg_chroma_transform_search( else { uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height); } + for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]); } @@ -700,7 +701,7 @@ void uvg_chroma_transform_search( pred_cu, cu_loc, COLOR_U, - scan_order, + SCAN_DIAG, transforms[i] == CHROMA_TS, COEFF_ORDER_LINEAR); u_bits += coeff_cost; @@ -717,7 +718,7 @@ void uvg_chroma_transform_search( pred_cu, cu_loc, COLOR_V, - scan_order, + SCAN_DIAG, transforms[i] == 
CHROMA_TS, COEFF_ORDER_LINEAR); } diff --git a/src/transform.h b/src/transform.h index 4d5e8ba8..50a3f7de 100644 --- a/src/transform.h +++ b/src/transform.h @@ -104,12 +104,10 @@ void uvg_quantize_lcu_residual( void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], diff --git a/tests/test_cabac_state.sh b/tests/test_cabac_state.sh index 519f9c40..e60806dc 100755 --- a/tests/test_cabac_state.sh +++ b/tests/test_cabac_state.sh @@ -6,10 +6,10 @@ set -eu cabacfile="$(mktemp)" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" rm -rf "${cabacfile}" From 790b1fad487ae96b003545c7109eec867858383d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 13 Sep 2022 11:20:25 +0300 Subject: [PATCH 093/254] wip --- src/cu.h | 8 ++++--- src/encode_coding_tree.c | 29 ++++++++++++++-------- src/encoderstate.c | 2 +- src/search.c | 52 +++++++++++++++++++--------------------- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/src/cu.h b/src/cu.h index dfad7861..c954ce9b 100644 --- a/src/cu.h +++ b/src/cu.h @@ -119,7 +119,7 @@ typedef struct { uint8_t type : 3; //!< \brief 
block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t tr_depth ; //!< \brief transform depth + uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index @@ -127,6 +127,9 @@ typedef struct uint8_t tr_idx : 3; //!< \brief transform index uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding + uint8_t log2_width : 3; + uint8_t log2_height : 3; + uint16_t cbf; uint32_t split_tree : 3 * 9; @@ -594,7 +597,6 @@ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane)); } -#define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth) -#define SET_SPLITDATA(CU,flag) { (CU)->split=(flag); } +#define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> (curDepth)) & 7) #endif diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 8c19df4b..931df0db 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -654,8 +654,6 @@ static void encode_transform_coeff( const cu_loc_t * cu_loc, int8_t depth, int8_t tr_depth, - uint8_t parent_coeff_u, - uint8_t parent_coeff_v, bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, @@ -726,13 +724,27 @@ static void encode_transform_coeff( } */ + if (split) { + int split_width = width >> 1; + int split_height = height >> 1; + + for (int j = 0; j < 2; j++) { + for (int i = 0; i < 2; i++) { + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); + + encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); + } + } + return; + } // Chroma cb flags are not signaled when one of the following: // - transform size is 4 (2x2 chroma transform doesn't exist) // - they have already been 
signaled to 0 previously // When they are not present they are inferred to be 0, except for size 4 // when the flags from previous level are used. if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T && last_split) { - + if (!split) { if (true) { assert(tr_depth < 5); @@ -741,14 +753,11 @@ static void encode_transform_coeff( } if (true) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); } } } - if (split) { - int split_width = width >> 1; - int split_height = height >> 1; for (int j = 0; j < 2; j++) { for (int i = 0; i < 2; i++) { @@ -1672,7 +1681,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (cbf) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } encode_mts_idx(state, cabac, cur_cu, cu_loc); @@ -1704,7 +1713,7 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, &split_loc, depth, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); can_skip_last_cbf &= luma_cbf_ctx == 2; } } @@ -1724,7 +1733,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, &cu_loc, depth, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); // Write LFNST only once for single tree structure 
encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); } diff --git a/src/encoderstate.c b/src/encoderstate.c index 593ffd16..a42ce424 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -635,7 +635,7 @@ static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return; cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y); - const int width = LCU_WIDTH >> cu->depth; + const int width = 1 << cu->log2_width; if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; diff --git a/src/search.c b/src/search.c index 11d934b9..2874248e 100644 --- a/src/search.c +++ b/src/search.c @@ -455,23 +455,6 @@ double uvg_cu_rd_cost_chroma( int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V); - - // See luma for why the second condition - if (!skip_residual_coding) { - const int tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - cabac->cur_ctx = ctx; - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - } - ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } - } - - if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; // Recursively process sub-CUs. 
@@ -490,6 +473,22 @@ double uvg_cu_rd_cost_chroma( return sum + tr_tree_bits * state->lambda; } + + if (!skip_residual_coding) { + const int tr_depth = depth - pred_cu->depth; + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + } + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } + } + + if (state->encoder_control->cfg.jccr) { int cbf_mask = u_is_set * 2 + v_is_set - 1; @@ -570,17 +569,7 @@ static double cu_rd_cost_tr_split_accurate( } } - - bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T; - if( !skip_residual_coding && has_chroma) { - if(tr_cu->tr_depth == depth) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); - } - if(tr_cu->tr_depth == depth) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); - } - } - + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; @@ -597,6 +586,13 @@ static double cu_rd_cost_tr_split_accurate( sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); return sum + tr_tree_bits * state->lambda; } + + bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T; + if (!skip_residual_coding && has_chroma) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, 
"cbf_cr"); + } + const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); From 0b6f666a1b33390a5ce7f7caac07ccefd59eb176 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 13 Sep 2022 14:38:19 +0300 Subject: [PATCH 094/254] [mtt] remove lfnst dependency to depth --- src/cu.h | 1 + src/encode_coding_tree.c | 94 ++++++++-------------------------------- src/encode_coding_tree.h | 6 +-- src/search.c | 16 ++++--- src/search_intra.c | 80 +++++----------------------------- src/transform.c | 2 +- 6 files changed, 44 insertions(+), 155 deletions(-) diff --git a/src/cu.h b/src/cu.h index c954ce9b..87d31f33 100644 --- a/src/cu.h +++ b/src/cu.h @@ -598,5 +598,6 @@ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) } #define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> (curDepth)) & 7) +#define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE) #endif diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 931df0db..925d80da 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -109,86 +109,32 @@ static void encode_mts_idx( bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu) + const cu_loc_t* const cu_loc) { - if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && pred_cu->depth == pred_cu->tr_depth) { + if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && PU_IS_TU(pred_cu)) { const int isp_mode = pred_cu->intra.isp_mode; - const int depth = pred_cu->depth; - const int chroma_width = width >> 1; - const int chroma_height = height >> 1; - const int cu_width = tree_type != UVG_LUMA_T || depth == 4 ? 
width : chroma_width; - const int cu_height = tree_type != UVG_LUMA_T || depth == 4 ? height : chroma_height; - bool can_use_lfnst_with_mip = (width >= 16 && height >= 16); - bool is_sep_tree = depth == 4 || tree_type != UVG_BOTH_T; + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + bool can_use_lfnst_with_mip = (cu_width >= 16 && cu_height >= 16); + bool is_sep_tree = tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? pred_cu->intra.mip_flag : false; - if ((isp_mode && !uvg_can_use_isp_with_lfnst(width, height, isp_mode, tree_type)) || + if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type)) || (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || (cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH)) { return false; } - bool luma_flag = (depth == 4 && color == COLOR_Y) || (tree_type != UVG_CHROMA_T && depth != 4); - bool chroma_flag = (depth == 4 && color != COLOR_Y) || tree_type != UVG_LUMA_T; + bool luma_flag = tree_type != UVG_CHROMA_T; + bool chroma_flag = tree_type != UVG_LUMA_T; bool non_zero_coeff_non_ts_corner_8x8 = false; bool last_scan_pos = false; bool is_tr_skip = false; - - int split_num = color == COLOR_Y && isp_mode ? uvg_get_isp_split_num(width, height, isp_mode, false) : 0; - const videoframe_t* const frame = state->tile->frame; - if (split_num) { - // Constraints for ISP split blocks - for (int i = 0; i < split_num; ++i) { - cu_loc_t split_loc; - uvg_get_isp_split_loc(&split_loc, x, y, width, height, i, isp_mode, false); - int local_split_x = split_loc.x; - int local_split_y = split_loc.y; - uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); - const cu_info_t* split_cu = lcu ? 
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : - uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); - - //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { - // ISP_TODO: remove this if clause altogether if it seems it is not needed - if (true) { - non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); - //last_scan_pos |= split_cu->lfnst_last_scan_pos; - last_scan_pos |= true; - } - } - } else { - non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); - last_scan_pos |= pred_cu->lfnst_last_scan_pos; - } - - //const int num_pred_units = kvz_part_mode_num_parts[pred_cu->part_size]; - const int tr_depth = pred_cu->tr_depth; - assert(depth <= tr_depth && "Depth greater than transform depth. This should never trigger."); - const int num_transform_units = 1 << (2 * (tr_depth - depth)); - const int tu_row_length = 1 << (tr_depth - depth); - const int tu_width = cu_width >> (tr_depth - depth); - const int tu_height = tu_width; // TODO: height for non-square blocks - - // TODO: chroma transform skip - if (color == COLOR_Y) { - for (int i = 0; i < num_transform_units; i++) { - // TODO: this works only for square blocks - const int tu_x = x + ((i % tu_row_length) * tu_width); - const int tu_y = y + ((i / tu_row_length) * tu_height); - const cu_info_t* cur_tu = lcu ? 
LCU_GET_CU_AT_PX(lcu, tu_x, tu_y) : uvg_cu_array_at_const(frame->cu_array, tu_x, tu_y); - assert(cur_tu != NULL && "NULL transform unit."); - bool cbf_set = cbf_is_set(cur_tu->cbf, tr_depth, COLOR_Y); - - if (cur_tu != NULL && cbf_set && cur_tu->tr_idx == MTS_SKIP) { - is_tr_skip = true; - } - } + if (color == COLOR_Y && pred_cu->tr_idx == MTS_SKIP) { + is_tr_skip = true; } if ((!pred_cu->lfnst_last_scan_pos && !isp_mode) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { @@ -205,19 +151,15 @@ static bool encode_lfnst_idx( const encoder_state_t* const state, cabac_data_t * const cabac, const cu_info_t * const pred_cu, - const int x, - const int y, - const int depth, - const int width, - const int height, enum uvg_tree_type tree_type, - const color_t color) + const color_t color, + const cu_loc_t* const cu_loc) { - if (uvg_is_lfnst_allowed(state, pred_cu, width, height, x, y, tree_type, color, NULL)) { + if (uvg_is_lfnst_allowed(state, pred_cu, tree_type, color, cu_loc)) { // Getting separate tree bool from block size is a temporary fix until a proper dual tree check is possible (there is no dual tree structure at time of writing this). // VTM seems to force explicit dual tree structure for small 4x4 blocks - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool is_separate_tree = tree_type != UVG_BOTH_T; const int lfnst_index = !is_separate_tree || color == COLOR_Y ? pred_cu->lfnst_idx : pred_cu->cr_lfnst_idx; assert((lfnst_index >= 0 && lfnst_index < 3) && "Invalid LFNST index."); @@ -1692,6 +1634,8 @@ void uvg_encode_coding_tree( uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, NULL, NULL); } + const bool is_local_dual_tree = cu_height * cu_width < 64 && tree_type == UVG_BOTH_T; + // Code chroma prediction mode. 
if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4 && tree_type == UVG_BOTH_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); @@ -1719,7 +1663,7 @@ void uvg_encode_coding_tree( } if (tree_type != UVG_CHROMA_T) { - bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y); + bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, is_local_dual_tree ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc); } encode_mts_idx(state, cabac, cur_cu, cu_loc); @@ -1735,7 +1679,7 @@ void uvg_encode_coding_tree( tmp->lfnst_last_scan_pos = false; encode_transform_coeff(state, &cu_loc, depth, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); // Write LFNST only once for single tree structure - encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); + encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, cu_loc); } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index e75ad46a..a7fe896b 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -45,13 +45,9 @@ bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pre bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu); + const cu_loc_t* const cu_loc); void uvg_encode_coding_tree( encoder_state_t * const state, diff --git a/src/search.c b/src/search.c index 2874248e..3fd67cc2 100644 --- a/src/search.c +++ b/src/search.c @@ -674,8 +674,11 @@ static double cu_rd_cost_tr_split_accurate( } } - if(cu_loc->width == 4 || tree_type == UVG_LUMA_T) { - if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, COLOR_Y, lcu)) { + const bool is_local_sep_tree = pred_cu->log2_width + 
pred_cu->log2_height < 6 && tree_type == UVG_BOTH_T; + + if(is_local_sep_tree || tree_type == UVG_LUMA_T) { + + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc)) { const int lfnst_idx = tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -747,11 +750,12 @@ static double cu_rd_cost_tr_split_accurate( } } - if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, cu_loc->width == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { - const int lfnst_idx = (cu_loc->width != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; + const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, cu_loc)) { + const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, - &cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T], + &cabac->ctx.lfnst_idx_model[is_chroma_tree], lfnst_idx != 0, tr_tree_bits, "lfnst_idx"); @@ -975,6 +979,8 @@ static double search_cu( cur_cu->lfnst_idx = 0; cur_cu->joint_cb_cr = 0; cur_cu->split_tree = split_tree.split_tree; + cur_cu->log2_width = uvg_g_convert_to_log2[cu_width]; + cur_cu->log2_height = uvg_g_convert_to_log2[cu_height]; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
diff --git a/src/search_intra.c b/src/search_intra.c index 6710b6fc..843836bf 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -300,7 +300,7 @@ static double search_intra_trdepth( pred_cu->tr_depth = depth; const bool mts_enabled = (state->encoder_control->cfg.mts == UVG_MTS_INTRA || state->encoder_control->cfg.mts == UVG_MTS_BOTH) - && tr_cu->depth == tr_cu->tr_depth; + && PU_IS_TU(pred_cu); nosplit_cost = 0.0; @@ -330,10 +330,10 @@ static double search_intra_trdepth( num_transforms = pred_cu->intra.isp_mode == ISP_MODE_NO_ISP ? num_transforms : 1; } const int mts_start = trafo; - //TODO: height - if (state->encoder_control->cfg.trskip_enable && - width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/ && - pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { // tr_skip cannot be used wit ISP + if (state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; @@ -346,9 +346,10 @@ static double search_intra_trdepth( max_lfnst_idx = 0; } + const bool is_local_dual_tree = pred_cu->log2_width + pred_cu->log2_height < 6 && tree_type == UVG_BOTH_T; + int start_idx = 0; - int end_idx = state->encoder_control->cfg.lfnst && - depth == pred_cu->tr_depth && + int end_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
max_lfnst_idx : 0; for (int i = start_idx; i < end_idx + 1; ++i) { search_data->lfnst_costs[i] = MAX_DOUBLE; @@ -436,11 +437,11 @@ static double search_intra_trdepth( lcu, search_data->best_isp_cbfs); double transform_bits = 0; - if (state->encoder_control->cfg.lfnst && depth == pred_cu->tr_depth && + if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && trafo != MTS_SKIP) { if (!constraints[0] && constraints[1]) { transform_bits += CTX_ENTROPY_FBITS( - &state->search_cabac.ctx.lfnst_idx_model[tr_cu->depth == 4 || + &state->search_cabac.ctx.lfnst_idx_model[is_local_dual_tree || tree_type == UVG_LUMA_T], lfnst_idx != 0); if (lfnst_idx > 0) { @@ -566,30 +567,6 @@ static double search_intra_trdepth( } nosplit_cbf = pred_cu->cbf; - - uvg_pixels_blit( - lcu->rec.y, - nosplit_pixels.y, - width, - width, - LCU_WIDTH, - width); - if (reconstruct_chroma) { - uvg_pixels_blit( - lcu->rec.u, - nosplit_pixels.u, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - uvg_pixels_blit( - lcu->rec.v, - nosplit_pixels.v, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - } } @@ -619,31 +596,7 @@ static double search_intra_trdepth( uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); } - - double cbf_bits = 0.0; - - // Add cost of cbf chroma bits on transform tree. - // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true - // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 - // if this and any previous transform block has no chroma coefficients. - // When searching the first block we don't actually know the real values, - // so this will code cbf as 0 and not code the cbf at all for descendants. 
- if (state->encoder_control->chroma_format != UVG_CSP_400) { - const uint8_t tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); - } - ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); - } - } - - double bits = cbf_bits; - split_cost += bits * state->lambda; + } else { assert(width <= TR_MAX_WIDTH); } @@ -652,17 +605,6 @@ static double search_intra_trdepth( return split_cost; } else { uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); - - pred_cu->cbf = nosplit_cbf; - - // We only restore the pixel data and not coefficients or cbf data. 
- // The only thing we really need are the border pixels.uvg_intra_get_dir_luma_predictor - uvg_pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); - if (reconstruct_chroma) { - uvg_pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); - uvg_pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C); - } - return nosplit_cost; } } diff --git a/src/transform.c b/src/transform.c index 86ff515b..2a532715 100644 --- a/src/transform.c +++ b/src/transform.c @@ -723,7 +723,7 @@ void uvg_chroma_transform_search( COEFF_ORDER_LINEAR); } if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { - if(uvg_is_lfnst_allowed(state, pred_cu, width, height, 0, 0 , UVG_CHROMA_T, COLOR_UV, lcu)) { + if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc)) { const int lfnst_idx = pred_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( &state->search_cabac, From 89af7bda8e90b16e6d4931ca86410b4a2c05ac20 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Sep 2022 08:45:09 +0300 Subject: [PATCH 095/254] [mtt] remove unnecessary depth dependency from split flag --- src/encode_coding_tree.c | 53 +++++++++++++++++++--------------------- src/encode_coding_tree.h | 2 +- src/search.c | 27 ++++++++++++++------ 3 files changed, 46 insertions(+), 36 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 925d80da..5277768c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1242,14 +1242,14 @@ bool uvg_write_split_flag( cabac_data_t* cabac, const cu_info_t * left_cu, const cu_info_t * above_cu, - uint8_t split_flag, const cu_loc_t* const cu_loc, + const uint32_t split_tree, int depth, enum uvg_tree_type tree_type, double* bits_out) { - uint16_t abs_x = (cu_loc->x + state->tile->offset_x) >> (tree_type == UVG_CHROMA_T); - uint16_t abs_y = (cu_loc->y + state->tile->offset_y) >> (tree_type == UVG_CHROMA_T); + uint16_t abs_x = cu_loc->x + 
(state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); + uint16_t abs_y = cu_loc->y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; // Implisit split flag when on border @@ -1271,23 +1271,23 @@ bool uvg_write_split_flag( bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T))); if (!bottom_left_available && !top_right_available && allow_qt) { - implicit_split_mode = UVG_QUAD_SPLIT; + implicit_split_mode = QT_SPLIT; } else if (!bottom_left_available && allow_btt) { - implicit_split_mode = UVG_HORZ_SPLIT; + implicit_split_mode = BT_HOR_SPLIT; } else if (!top_right_available && allow_btt) { - implicit_split_mode = UVG_VERT_SPLIT; + implicit_split_mode = BT_VER_SPLIT; } else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = UVG_QUAD_SPLIT; + implicit_split_mode = QT_SPLIT; } // Check split conditions if (implicit_split_mode != UVG_NO_SPLIT) { no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == UVG_HORZ_SPLIT); - bv_split = (implicit_split_mode == UVG_VERT_SPLIT); + bh_split = (implicit_split_mode == BT_HOR_SPLIT); + bv_split = (implicit_split_mode == BT_VER_SPLIT); } if (!allow_btt) { @@ -1296,17 +1296,18 @@ bool uvg_write_split_flag( bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; - split_flag |= implicit_split_mode != UVG_NO_SPLIT; + int split_flag = (split_tree >> (depth * 3)) & 7; + + split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag; int split_model = 0; if (no_split && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number - // ToDo: should use height and width to increase model, PU_GET_W() ? 
- if (left_cu && left_cu->depth > depth) { + if (left_cu && (1 << left_cu->log2_height) < cu_height) { split_model++; } - if (above_cu && above_cu->depth > depth) { + if (above_cu && (1 << above_cu->log2_width) < cu_width) { split_model++; } @@ -1322,15 +1323,11 @@ bool uvg_write_split_flag( split_model += 3 * (split_num >> 1); cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - if(cabac->only_count && !split_flag) { - //printf("%hu %hu %d %d %d\n", state->search_cabac.ctx.split_flag_model[split_model].state[0], state->search_cabac.ctx.split_flag_model[split_model].state[1], - // split_model, x, y); - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != 0, bits, "split_flag"); } - bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT; + bool qt_split = split_flag == UVG_QUAD_SPLIT; if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); @@ -1342,17 +1339,17 @@ bool uvg_write_split_flag( split_model = 0; - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + // TODO: These are incorrect + if (left_cu && (1 << left_cu->log2_height) > cu_height) { split_model++; } - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + if (above_cu && (1 << above_cu->log2_width) > cu_width) { split_model++; } split_model += (depth > 2 ? 
0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "split_cu_mode"); } if (bits_out) *bits_out += bits; return split_flag; @@ -1414,9 +1411,9 @@ void uvg_encode_coding_tree( state, cabac, left_cu, - above_cu, - (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, + above_cu, cu_loc, + cur_cu->split_tree, depth, tree_type, NULL); @@ -1725,7 +1722,7 @@ double uvg_mock_encode_coding_unit( left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); } else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); } } if (y) { @@ -1733,7 +1730,7 @@ double uvg_mock_encode_coding_unit( above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); } else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); } } @@ -1748,8 +1745,8 @@ double uvg_mock_encode_coding_unit( cabac, left_cu, above_cu, - 0, cu_loc, + cur_cu->split_tree, depth, tree_type, &bits); diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index a7fe896b..86605e4d 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -101,8 +101,8 @@ bool uvg_write_split_flag( cabac_data_t* cabac, const cu_info_t* left_cu, const cu_info_t* above_cu, - uint8_t split_flag, const cu_loc_t* const cu_loc, + const uint32_t split_tree, int depth, enum uvg_tree_type tree_type, double* bits_out); diff --git a/src/search.c b/src/search.c index 3fd67cc2..fb28085d 100644 --- a/src/search.c +++ b/src/search.c @@ -193,6 +193,9 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in to->violates_lfnst_constrained_luma = cu->violates_lfnst_constrained_luma; 
to->violates_lfnst_constrained_chroma = cu->violates_lfnst_constrained_chroma; + to->log2_height = cu->log2_height; + to->log2_width = cu->log2_width; + if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; to->intra.mode_chroma = cu->intra.mode_chroma; @@ -1256,6 +1259,14 @@ static double search_cu( } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { + // The cabac functions assume chroma locations whereas the search uses luma locations + // for the chroma tree, therefore we need to shift the chroma coordinates here for + // passing to the bit cost calculating functions. + cu_loc_t chroma_loc = *cu_loc; + chroma_loc.y >>= 1; + chroma_loc.x >>= 1; + + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; @@ -1263,7 +1274,9 @@ static double search_cu( bits += uvg_mock_encode_coding_unit( state, cabac, - cu_loc, lcu, cur_cu, + tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, + lcu, + cur_cu, tree_type); @@ -1310,6 +1323,8 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { + const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << (split_tree.current_depth * 3), split_tree.current_depth + 1 }; + int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); @@ -1345,9 +1360,9 @@ static double search_cu( state, &state->search_cabac, left_cu, - above_cu, - 1, - cu_loc, + above_cu, + tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, + new_split.split_tree, depth, tree_type, &split_bits); @@ -1362,7 +1377,6 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. 
if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << (split_tree.current_depth * 3), split_tree.current_depth + 1}; cu_loc_t new_cu_loc; if (split_cost < cost) { uvg_cu_loc_ctor(&new_cu_loc, x, y, half_cu, half_cu); @@ -1407,8 +1421,7 @@ static double search_cu( double bits = 0; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, - 0, cu_loc, depth, tree_type, &bits); + y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, cur_cu->split_tree, depth, tree_type, &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; From e3dbeda7f7d07b76ac3aed900939874a7f85e499 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Sep 2022 11:29:47 +0300 Subject: [PATCH 096/254] [mtt] remove dependency to depth from deblock --- src/filter.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/src/filter.c b/src/filter.c index b366dd4e..a89002d2 100644 --- a/src/filter.c +++ b/src/filter.c @@ -276,10 +276,11 @@ static bool is_tu_boundary( // if (x & 3 || y & 3) return false; const cu_info_t *const scu = uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? 
state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y); - const int tu_width = LCU_WIDTH >> (scu->tr_depth + (tree_type == UVG_CHROMA_T)); + const int tu_width = MIN(TR_MAX_WIDTH, 1 << scu->log2_width); + const int tu_height = MIN(TR_MAX_WIDTH, 1 << scu->log2_height); if (dir == EDGE_HOR) { - return (y & (tu_width - 1)) == 0; + return (y & (tu_height - 1)) == 0; } else { return (x & (tu_width - 1)) == 0; } @@ -854,15 +855,20 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, bool is_side_Q_large = false; uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int cu_size = LCU_WIDTH >> cu_q->depth; - // TODO: NON square - const int pu_size = dir == EDGE_HOR ? cu_size : cu_size; - const int pu_pos = dir == EDGE_HOR ? y_coord - : x_coord; + + const int cu_width = 1 << cu_q->log2_width; + const int cu_height = 1 << cu_q->log2_height; + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + + + const int tu_size_p_side = dir == EDGE_HOR ? MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + const int tu_size_q_side = dir == EDGE_HOR ? MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, - LCU_WIDTH >> cu_p->tr_depth, - LCU_WIDTH >> cu_q->tr_depth, + tu_size_p_side, + tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_Y, UVG_LUMA_T); @@ -1083,19 +1089,25 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1); cu_q = uvg_cu_array_at(cua, x_coord, y_coord ); } - - const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T)); - // TODO: non-square - const int pu_size = dir == EDGE_HOR ? cu_size : cu_size; - const int pu_pos = dir == EDGE_HOR ? 
y_coord - : x_coord; + uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int tu_p_size = LCU_WIDTH >> (cu_p->tr_depth + (chroma_shift)); - const int tu_q_size = LCU_WIDTH >> (cu_q->tr_depth + (chroma_shift)); + const int cu_width = 1 << (cu_q->log2_width - (tree_type != UVG_CHROMA_T)); + const int cu_height = 1 << (cu_q->log2_height - (tree_type != UVG_CHROMA_T)); + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + + + const int tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << (cu_p->log2_height - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH) : + MIN(1 << (cu_p->log2_width - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH); + const int tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << (cu_q->log2_height - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH) : + MIN(1 << (cu_q->log2_width - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH); + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, - dir, tu_boundary, tu_p_size, tu_q_size, + dir, tu_boundary, tu_size_p_side, tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_U, tree_type); From 9a29d9ded3fcfa2bdfe07c664f8db061c5c69a2d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Sep 2022 11:47:26 +0300 Subject: [PATCH 097/254] [mtt] remove depth from cbf --- src/cu.h | 37 +++++++++++++++++------------------ src/encode_coding_tree.c | 22 ++++++++++----------- src/encoderstate.c | 8 ++++---- src/filter.c | 12 ++++++------ src/intra.c | 22 ++++++++++----------- src/rdo.c | 2 +- src/search.c | 42 ++++++++++++++++++++-------------------- src/search_inter.c | 18 ++++++++--------- src/search_intra.c | 12 ++++++------ src/transform.c | 22 ++++++++++----------- 10 files changed, 98 insertions(+), 99 deletions(-) diff --git a/src/cu.h b/src/cu.h index 87d31f33..77965072 100644 --- a/src/cu.h +++ b/src/cu.h @@ -132,6 +132,8 @@ typedef struct uint16_t cbf; + uint8_t root_cbf; + uint32_t split_tree : 3 * 9; /** @@ 
-536,34 +538,31 @@ static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y) } while(0) -#define NUM_CBF_DEPTHS 5 -static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 }; - /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane) +static INLINE int cbf_is_set(uint16_t cbf, color_t plane) { - return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0; + return (cbf & (1 << (plane))) != 0; } /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set_any(uint16_t cbf, int depth) +static INLINE int cbf_is_set_any(uint16_t cbf) { - return cbf_is_set(cbf, depth, COLOR_Y) || - cbf_is_set(cbf, depth, COLOR_U) || - cbf_is_set(cbf, depth, COLOR_V); + return cbf_is_set(cbf, COLOR_Y) || + cbf_is_set(cbf, COLOR_U) || + cbf_is_set(cbf, COLOR_V); } /** * Set CBF in a level to true. */ -static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_set(uint16_t *cbf, color_t plane) { // Return value of the bit corresponding to the level. - *cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane); + *cbf |= (1) << (plane); } /** @@ -572,20 +571,20 @@ static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane) */ static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane) { - bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) || - cbf_is_set(child_cbfs[1], depth + 1, plane) || - cbf_is_set(child_cbfs[2], depth + 1, plane); + bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) || + cbf_is_set(child_cbfs[1], plane) || + cbf_is_set(child_cbfs[2], plane); if (child_cbf_set) { - cbf_set(cbf, depth, plane); + cbf_set(cbf, plane); } } /** * Set CBF in a levels <= depth to false. 
*/ -static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_clear(uint16_t *cbf, color_t plane) { - *cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane)); + *cbf &= ~(1 << (plane)); } /** @@ -593,8 +592,8 @@ static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) */ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) { - cbf_clear(cbf, 0, plane); - *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane)); + cbf_clear(cbf, plane); + *cbf |= src & (1 << plane); } #define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> (curDepth)) & 7) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 5277768c..c7a314b9 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -466,7 +466,7 @@ static void encode_chroma_tu( uvg_get_sub_coeff(coeff_u, coeff->u, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); uvg_get_sub_coeff(coeff_v, coeff->v, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) { + if (cbf_is_set(cur_pu->cbf, COLOR_U)) { // TODO: height for this check and the others below if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; @@ -477,7 +477,7 @@ static void encode_chroma_tu( uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, cu_loc, COLOR_U, *scan_idx, cur_pu, NULL); } - if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { + if (cbf_is_set(cur_pu->cbf, COLOR_V)) { if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); @@ -526,7 +526,7 @@ static void encode_transform_unit( int8_t scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); - int cbf_y = 
cbf_is_set(cur_pu->cbf, depth, COLOR_Y); + int cbf_y = cbf_is_set(cur_pu->cbf, COLOR_Y); if (cbf_y && !only_chroma) { int x_local = x % LCU_WIDTH; @@ -573,8 +573,8 @@ static void encode_transform_unit( } } - bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) || - cbf_is_set(cur_pu->cbf, depth, COLOR_V); + bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, COLOR_U) || + cbf_is_set(cur_pu->cbf, COLOR_V); if ((chroma_cbf_set || joint_chroma) && last_split) { //Need to drop const to get lfnst constraints // Use original dimensions instead of ISP split dimensions @@ -644,9 +644,9 @@ static void encode_transform_coeff( int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH); - const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0; - const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0; - const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V)) : 0; + const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, COLOR_Y) : 0; + const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, COLOR_U)) : 0; + const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? 
cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, COLOR_V)) : 0; // The split_transform_flag is not signaled when: // - transform size is greater than 32 (depth == 0) @@ -1610,15 +1610,15 @@ void uvg_encode_coding_tree( } { - int cbf = cbf_is_set_any(cur_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual + const bool has_coeffs = cur_pu->root_cbf || cur_pu->cbf; if (!cur_cu->merged) { cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); - CABAC_BIN(cabac, cbf, "rqt_root_cbf"); + CABAC_BIN(cabac, has_coeffs, "rqt_root_cbf"); } // Code (possible) coeffs to bitstream - if (cbf) { + if (has_coeffs) { int luma_cbf_ctx = 0; encode_transform_coeff(state, cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } diff --git a/src/encoderstate.c b/src/encoderstate.c index a42ce424..f2db175f 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -667,12 +667,12 @@ static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) { for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) { cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); - if (cbf_is_set_any(tu->cbf, cu->depth)) { + if (cbf_is_set_any(tu->cbf)) { cbf_found = true; } } } - } else if (cbf_is_set_any(cu->cbf, cu->depth)) { + } else if (cbf_is_set_any(cu->cbf)) { cbf_found = true; } @@ -2045,9 +2045,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame) { #if UVG_DEBUG_PRINT_CABAC == 1 - uvg_cabac_bins_count = 0; + // uvg_cabac_bins_count = 0; if (state->frame->num == 0) uvg_cabac_bins_verbose = true; - else uvg_cabac_bins_verbose = false; + // else uvg_cabac_bins_verbose = false; #endif diff --git a/src/filter.c b/src/filter.c index a89002d2..c0ee7c68 
100644 --- a/src/filter.c +++ b/src/filter.c @@ -757,8 +757,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, cu_q = uvg_cu_array_at(frame->cu_array, x_coord, y); } - bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y); + bool nonzero_coeffs = cbf_is_set(cu_q->cbf, COLOR_Y) + || cbf_is_set(cu_p->cbf, COLOR_Y); // Filter strength strength = 0; @@ -1122,10 +1122,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, c_strength[1] = 2; } else if (tu_boundary){ //TODO: Add ciip/IBC related stuff - bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_U) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_U); - bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_V) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_V); + bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, COLOR_U) + || cbf_is_set(cu_p->cbf, COLOR_U); + bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, COLOR_V) + || cbf_is_set(cu_p->cbf, COLOR_V); c_strength[0] = nonzero_coeffs_U ? 1 : 0; c_strength[1] = nonzero_coeffs_V ? 
1 : 0; } diff --git a/src/intra.c b/src/intra.c index 8f87104f..6fb96aea 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1792,11 +1792,11 @@ void uvg_intra_recon_cu( // Reset CBFs because CBFs might have been set // for depth earlier if (recon_luma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_Y); + cbf_clear(&cur_cu->cbf, COLOR_Y); } if (recon_chroma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); } if (width > TR_MAX_WIDTH || height > TR_MAX_WIDTH) { @@ -1820,13 +1820,13 @@ void uvg_intra_recon_cu( LCU_GET_CU_AT_PX(lcu, (lcu_px.x + half_width) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + half_height) >> (tree_type == UVG_CHROMA_T))->cbf, }; - if (recon_luma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); - } - if (recon_chroma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); - } + //if (recon_luma && depth <= MAX_DEPTH) { + // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); + //} + //if (recon_chroma && depth <= MAX_DEPTH) { + // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); + // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); + //} return; } if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { @@ -1848,7 +1848,7 @@ void uvg_intra_recon_cu( uvg_quantize_lcu_residual(state, true, false, false, &tu_loc, depth, cur_cu, lcu, false, tree_type); - search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, depth, COLOR_Y) << i; + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, COLOR_Y) << i; } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; diff --git a/src/rdo.c b/src/rdo.c index 262b4f83..e3b3bff6 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1732,7 +1732,7 @@ void uvg_rdoq( assert(0); } // 
This cbf should work even with non-square blocks - ctx_cbf = ( color != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); + ctx_cbf = ( color != COLOR_V ? 0 : cbf_is_set(cbf, COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); } diff --git a/src/search.c b/src/search.c index fb28085d..b741cbc7 100644 --- a/src/search.c +++ b/src/search.c @@ -361,11 +361,11 @@ double uvg_cu_rd_cost_luma( if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; - int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + int is_set = cbf_is_set(pred_cu->cbf, COLOR_Y); if (pred_cu->type == CU_INTRA || is_tr_split || - cbf_is_set(tr_cu->cbf, depth, COLOR_U) || - cbf_is_set(tr_cu->cbf, depth, COLOR_V)) + cbf_is_set(tr_cu->cbf, COLOR_U) || + cbf_is_set(tr_cu->cbf, COLOR_V)) { cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); @@ -455,8 +455,8 @@ double uvg_cu_rd_cost_chroma( } const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U); - int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V); + int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, COLOR_U); + int v_is_set = pred_cu->joint_cb_cr ? 
(pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, COLOR_V); if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; @@ -482,11 +482,11 @@ double uvg_cu_rd_cost_chroma( cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); cabac->cur_ctx = ctx; - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, COLOR_U)) { CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, COLOR_V)) { CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } } @@ -558,13 +558,13 @@ static double cu_rd_cost_tr_split_accurate( const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U); - const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V); + const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, COLOR_U); + const int cb_flag_v = tr_cu->joint_cb_cr ? 
tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; { - int cbf = cbf_is_set_any(tr_cu->cbf, depth); + int cbf = cbf_is_set_any(tr_cu->cbf); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { @@ -596,7 +596,7 @@ static double cu_rd_cost_tr_split_accurate( CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); } - const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; + const int cb_flag_y = cbf_is_set(tr_cu->cbf, COLOR_Y) && tree_type != UVG_CHROMA_T; const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. @@ -1182,8 +1182,8 @@ static double search_cu( const int split_type = intra_search.pred_cu.intra.isp_mode; const int split_num = split_type == ISP_MODE_NO_ISP ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); - const int cbf_cb = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_U); - const int cbf_cr = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_V); + const int cbf_cb = cbf_is_set(cur_cu->cbf, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, COLOR_V); const int jccr = cur_cu->joint_cb_cr; for (int i = 0; i < split_num; ++i) { cu_loc_t isp_loc; @@ -1195,14 +1195,14 @@ static double search_cu( uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; - cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_Y); - cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_U); - cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_V); + cbf_clear(&split_cu->cbf, COLOR_Y); + cbf_clear(&split_cu->cbf, COLOR_U); + cbf_clear(&split_cu->cbf, COLOR_V); if (cur_cbf) { - cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_Y); + cbf_set(&split_cu->cbf, COLOR_Y); } - if(cbf_cb) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_U); - if(cbf_cr) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_V); + if(cbf_cb) cbf_set(&split_cu->cbf, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, COLOR_V); split_cu->joint_cb_cr = jccr; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1241,7 +1241,7 @@ static double search_cu( false, tree_type); - int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); + int cbf = cbf_is_set_any(cur_cu->cbf); if (cur_cu->merged && !cbf) { cur_cu->merged = 0; @@ -1327,7 +1327,7 @@ static double search_cu( int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); + int cbf = cbf_is_set_any(cur_cu->cbf); cabac_data_t post_seach_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); 
memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); diff --git a/src/search_inter.c b/src/search_inter.c index 46b04349..b6071a8d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1816,7 +1816,7 @@ static void search_pu_inter( uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { @@ -1827,7 +1827,7 @@ static void search_pu_inter( cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -2228,20 +2228,20 @@ void uvg_cu_cost_inter_rd2( v_resi, &chorma_ts_out, UVG_BOTH_T); - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); if (chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { cur_cu->joint_cb_cr = 0; cur_cu->tr_skip |= (chorma_ts_out.best_u_index == CHROMA_TS) << COLOR_U; cur_cu->tr_skip |= (chorma_ts_out.best_v_index == CHROMA_TS) << COLOR_V; - if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_U); + if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; } else { cur_cu->joint_cb_cr = chorma_ts_out.best_combined_index; - if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, COLOR_U); + if 
(chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_combined_cost; } } @@ -2257,7 +2257,7 @@ void uvg_cu_cost_inter_rd2( UVG_BOTH_T); } - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf); if(cbf) { *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0); diff --git a/src/search_intra.c b/src/search_intra.c index 843836bf..61da97ef 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -304,10 +304,10 @@ static double search_intra_trdepth( nosplit_cost = 0.0; - cbf_clear(&pred_cu->cbf, depth, COLOR_Y); + cbf_clear(&pred_cu->cbf, COLOR_Y); if (reconstruct_chroma) { - cbf_clear(&pred_cu->cbf, depth, COLOR_U); - cbf_clear(&pred_cu->cbf, depth, COLOR_V); + cbf_clear(&pred_cu->cbf, COLOR_U); + cbf_clear(&pred_cu->cbf, COLOR_V); } const int8_t chroma_mode = reconstruct_chroma ? (!pred_cu->intra.mip_flag ? pred_cu->intra.mode : 0) : -1; @@ -398,7 +398,7 @@ static double search_intra_trdepth( ); if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; - if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; + if (trafo != 0 && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; derive_mts_constraints(pred_cu, lcu, width, height, lcu_px); if (pred_cu->tr_idx > 1) { @@ -424,7 +424,7 @@ static double search_intra_trdepth( COLOR_Y); } - if (!constraints[1] && cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) { + if (!constraints[1] && cbf_is_set(pred_cu->cbf, COLOR_Y)) { //end_idx = 0; if (pred_cu->lfnst_idx > 0) { continue; @@ -1347,7 +1347,7 @@ static int8_t search_intra_rdo( best_isp_mode = isp_mode; best_bits = search_data[mode].bits; } - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf)) { modes_to_check = mode + 1; break; } diff --git a/src/transform.c b/src/transform.c index 
2a532715..1886d91b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -446,7 +446,7 @@ static void quantize_chroma( if (transforms[i] == DCT7_CHROMA) { uint16_t temp_cbf = 0; - if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U); + if (*u_has_coeffs)cbf_set(&temp_cbf, COLOR_U); uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, scan_order, CU_INTRA, temp_cbf, lfnst_idx); @@ -1289,7 +1289,7 @@ static void quantize_tr_residual( for (int j = 0; j < tr_height; ++j) { memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); } - cbf_set(&cur_pu->cbf, depth, color); + cbf_set(&cur_pu->cbf, color); } else { for (int j = 0; j < tr_height; ++j) { @@ -1318,13 +1318,12 @@ static void quantize_tr_residual( } - // ISP_TODO: does this cu point to correct cbf when ISP is used for small blocks? - cbf_clear(&cur_pu->cbf, depth, color); + cbf_clear(&cur_pu->cbf, color); if (has_coeffs) { for (int j = 0; j < tr_height; ++j) { memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); } - cbf_set(&cur_pu->cbf, depth, color); + cbf_set(&cur_pu->cbf, color); } else { for (int j = 0; j < tr_height; ++j) { @@ -1387,11 +1386,11 @@ void uvg_quantize_lcu_residual( // for depth earlier // ISP_TODO: does this cur_cu point to the correct place when ISP is used for small blocks? 
if (luma) { - cbf_clear(&cur_pu->cbf, depth, COLOR_Y); + cbf_clear(&cur_pu->cbf, COLOR_Y); } if (chroma || jccr) { - cbf_clear(&cur_pu->cbf, depth, COLOR_U); - cbf_clear(&cur_pu->cbf, depth, COLOR_V); + cbf_clear(&cur_pu->cbf, COLOR_U); + cbf_clear(&cur_pu->cbf, COLOR_V); } if (depth == 0 || cur_pu->tr_depth > depth) { @@ -1423,9 +1422,10 @@ void uvg_quantize_lcu_residual( }; if (depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V); + cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) + || cbf_is_set_any(child_cbfs[0]) + || cbf_is_set_any(child_cbfs[1]) + || cbf_is_set_any(child_cbfs[2]); } } else { From b14f6f98ec38936bd417827967e179505fbbf614 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Sep 2022 14:00:08 +0300 Subject: [PATCH 098/254] [mtt] Completely remove tr_depth --- src/cfg.c | 9 ---- src/cu.h | 5 +- src/encode_coding_tree.c | 71 +++++--------------------- src/encoder.c | 6 +-- src/encoder.h | 2 - src/encoderstate.c | 4 +- src/filter.c | 1 - src/intra.c | 2 +- src/search.c | 53 +++---------------- src/search.h | 7 --- src/search_inter.c | 8 +-- src/search_intra.c | 28 +++------- src/strategies/avx2/quant-avx2.c | 1 - src/strategies/generic/quant-generic.c | 2 - src/transform.c | 13 +++-- src/uvg266.h | 1 - 16 files changed, 42 insertions(+), 171 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index f5763be8..f2073da5 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -80,7 +80,6 @@ int uvg_config_init(uvg_config *cfg) cfg->trskip_max_size = 2; //Default to 4x4 cfg->mts = 0; cfg->mts_implicit = 0; - cfg->tr_depth_intra = 0; cfg->ime_algorithm = 0; /* hexbs */ cfg->fme_level = 4; cfg->source_scan_type = 0; /* progressive */ @@ -930,8 +929,6 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) cfg->mts = mts_type; cfg->mts_implicit = (mts_type == UVG_MTS_IMPLICIT); } - 
else if OPT("tr-depth-intra") - cfg->tr_depth_intra = atoi(value); else if OPT("me") { int8_t ime_algorithm = 0; if (!parse_enum(value, me_names, &ime_algorithm)) return 0; @@ -1686,12 +1683,6 @@ int uvg_config_validate(const uvg_config *const cfg) error = 1; } - if (cfg->tr_depth_intra < 0 || cfg->tr_depth_intra > 4) { - // range is 0 .. CtbLog2SizeY - Log2MinTrafoSize - fprintf(stderr, "Input error: --tr-depth-intra is out of range [0..4]\n"); - error = 1; - } - if (cfg->fme_level != 0 && cfg->fme_level > 4) { fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n"); error = 1; diff --git a/src/cu.h b/src/cu.h index 77965072..4a2e416c 100644 --- a/src/cu.h +++ b/src/cu.h @@ -119,7 +119,6 @@ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index @@ -200,7 +199,7 @@ void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); } \ } while (0) -#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \ +#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d coded=%d " \ "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \ "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \ "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \ @@ -208,7 +207,7 @@ void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \ "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d 
inter.mvd[1]=%d " \ "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \ - , (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \ + , (cu).type, (cu).depth, (cu).part_size, (cu).coded, \ (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \ (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \ (cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index c7a314b9..1c7af8d7 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -587,7 +587,6 @@ static void encode_transform_unit( * \param x_pu Prediction units' x coordinate. * \param y_pu Prediction units' y coordinate. * \param depth Depth from LCU. - * \param tr_depth Depth from last CU. * \param parent_coeff_u What was signaled at previous level for cbf_cb. * \param parent_coeff_v What was signlaed at previous level for cbf_cr. */ @@ -595,7 +594,6 @@ static void encode_transform_coeff( encoder_state_t * const state, const cu_loc_t * cu_loc, int8_t depth, - int8_t tr_depth, bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, @@ -626,45 +624,13 @@ static void encode_transform_coeff( const int x_cu = 8 * (x / 8); const int y_cu = 8 * (y / 8); const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); // TODO: very suspect, chroma cbfs stored in upper left corner, everything else in bottom right for depth 4 - - // NxN signifies implicit transform split at the first transform level. - // There is a similar implicit split for inter, but it is only used when - // transform hierarchy is not in use. - //int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN); - - // The implicit split by intra NxN is not counted towards max_tr_depth. 
- /* - int max_tr_depth; - if (cur_cu->type == CU_INTRA) { - max_tr_depth = ctrl->cfg.tr_depth_intra + intra_split_flag; - } else { - max_tr_depth = ctrl->tr_depth_inter; - } - */ - + int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH); const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, COLOR_Y) : 0; const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, COLOR_U)) : 0; const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, COLOR_V)) : 0; - // The split_transform_flag is not signaled when: - // - transform size is greater than 32 (depth == 0) - // - transform size is 4 (depth == MAX_PU_DEPTH) - // - transform depth is max - // - cu is intra NxN and it's the first split - - //ToDo: check BMS transform split in QTBT - /* - if (depth > 0 && - depth < MAX_PU_DEPTH && - tr_depth < max_tr_depth && - !(intra_split_flag && tr_depth == 0)) - { - cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - CABAC_BIN(cabac, split, "split_transform_flag"); - } - */ if (split) { int split_width = width >> 1; @@ -675,29 +641,20 @@ static void encode_transform_coeff( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); + encode_transform_coeff(state, &loc, depth + 1, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); } } return; } // Chroma cb flags are not signaled when one of the following: - // - transform size is 4 (2x2 chroma transform doesn't exist) - // - they have already been signaled to 0 previously - // When they are not present they are inferred to be 0, except for size 4 - // when the flags from previous level are used. + // No chroma. 
+ // Not the last CU for area of 64 pixels cowered by more than one luma CU. + // Not the last ISP Split if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T && last_split) { - - if (!split) { - if (true) { - assert(tr_depth < 5); - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); - } - if (true) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); - } - } + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); } @@ -717,13 +674,13 @@ static void encode_transform_coeff( // - transform depth > 0 // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. - if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { + if ((cur_cu->type == CU_INTRA || !PU_IS_TU(cur_cu) || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { if (can_skip_last_cbf && isp_split && last_split) { // Do not write luma cbf if first three isp splits have luma cbf 0 } else { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); - if (tr_depth == 0) { + if (PU_IS_TU(cur_cu)) { *luma_cbf_ctx = 2 + cb_flag_y; } } @@ -1620,7 +1577,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (has_coeffs) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, cu_loc, depth, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } encode_mts_idx(state, cabac, cur_cu, cu_loc); @@ -1654,7 +1611,7 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == 
split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, &split_loc, depth, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); can_skip_last_cbf &= luma_cbf_ctx == 2; } } @@ -1674,7 +1631,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, &cu_loc, depth, 0, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, cu_loc, depth, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, cu_loc); } diff --git a/src/encoder.c b/src/encoder.c index d0121037..f3d7653a 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -378,11 +378,7 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) { goto init_failed; } - - // NOTE: When tr_depth_inter is equal to 0, the transform is still split - // for SMP and AMP partition units. - encoder->tr_depth_inter = 0; - + //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; diff --git a/src/encoder.h b/src/encoder.h index 0fb46e1b..be835890 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -132,8 +132,6 @@ typedef struct encoder_control_t FILE *roi_file; - int tr_depth_inter; - //! 
pic_parameter_set struct { uint8_t dependent_slice_segments_enabled_flag; diff --git a/src/encoderstate.c b/src/encoderstate.c index f2db175f..6557a75f 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -660,10 +660,10 @@ static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int int y_limit = cu_loc->y + cu_loc->height; int x_limit = cu_loc->x + cu_loc->width; - if (cu->tr_depth > depth) { + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. - const int tu_width = LCU_WIDTH >> cu->tr_depth; + const int tu_width = MIN(TR_MAX_WIDTH, 1 << cu->log2_width); for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) { for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) { cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); diff --git a/src/filter.c b/src/filter.c index c0ee7c68..10d135a1 100644 --- a/src/filter.c +++ b/src/filter.c @@ -767,7 +767,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, } else if (tu_boundary && nonzero_coeffs) { // Non-zero residual/coeffs and transform boundary - // Neither CU is intra so tr_depth <= MAX_DEPTH. strength = 1; } else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == UVG_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice? diff --git a/src/intra.c b/src/intra.c index 6fb96aea..c38544b5 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1553,7 +1553,7 @@ void uvg_intra_predict( } else { uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); - if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { + if (!PU_IS_TU(&data->pred_cu) || data->cclm_parameters[color == COLOR_U ? 
0 : 1].b <= 0) { predict_cclm( state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1], diff --git a/src/search.c b/src/search.c index b741cbc7..6852c961 100644 --- a/src/search.c +++ b/src/search.c @@ -158,25 +158,6 @@ static void work_tree_copy_down( } } -void uvg_lcu_fill_trdepth( - lcu_t *lcu, - const cu_loc_t* const cu_loc, - uint8_t tr_depth, - enum uvg_tree_type - tree_type) -{ - const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); - const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); - const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const unsigned height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; - - for (unsigned y = 0; y < height; y += SCU_WIDTH) { - for (unsigned x = 0; x < width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth; - } - } -} - static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, int height, const cu_info_t *cu) { // Set mode in every CU covered by part_mode in this depth. @@ -216,8 +197,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) { - const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; - const uint32_t mask = ~((width >> tr_split)-1); + const uint32_t mask = ~((MIN(width, TR_MAX_WIDTH))-1); // Set coeff flags in every CU covered by part_mode in this depth. for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) { @@ -360,10 +340,9 @@ double uvg_cu_rd_cost_luma( // Add transform_tree cbf_luma bit cost. 
if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; int is_set = cbf_is_set(pred_cu->cbf, COLOR_Y); if (pred_cu->type == CU_INTRA || - is_tr_split || + !PU_IS_TU(pred_cu) || cbf_is_set(tr_cu->cbf, COLOR_U) || cbf_is_set(tr_cu->cbf, COLOR_V)) { @@ -482,13 +461,11 @@ double uvg_cu_rd_cost_chroma( cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); cabac->cur_ctx = ctx; - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - } + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } @@ -969,7 +946,6 @@ static double search_cu( cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth cur_cu->depth = (split_tree.current_depth > MAX_DEPTH) ? MAX_DEPTH : split_tree.current_depth; - cur_cu->tr_depth = cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH ? 
1 : split_tree.current_depth; cur_cu->type = CU_NOTSET; cur_cu->qp = state->qp; cur_cu->bdpcmMode = 0; @@ -1112,9 +1088,6 @@ static double search_cu( intra_search.pred_cu.intra.mode_chroma = intra_mode; } intra_search.pred_cu.intra.mode = intra_mode; - if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, cu_loc, split_tree.current_depth, tree_type); - } } if (intra_cost < cost) { cost = intra_cost; @@ -1216,11 +1189,6 @@ static double search_cu( if (cur_cu->inter.mv_dir & 1) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[0][0], &cur_cu->inter.mv[0][1]); if (cur_cu->inter.mv_dir & 2) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[1][0], &cur_cu->inter.mv[1][1]); } - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, split_tree.current_depth); - - uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); @@ -1296,12 +1264,6 @@ static double search_cu( lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - if (cur_cu->tr_depth != 0) { - // Reset transform depth since there are no coefficients. This - // ensures that CBF is cleared for the whole area of the CU. 
- uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); - } - cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } @@ -1430,8 +1392,7 @@ static double search_cu( cur_cu->intra.multi_ref_idx = 0; cur_cu->lfnst_idx = 0; cur_cu->cr_lfnst_idx = 0; - - uvg_lcu_fill_trdepth(lcu, cu_loc, cur_cu->tr_depth, tree_type); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); intra_search_data_t proxy; diff --git a/src/search.h b/src/search.h index 73c7efec..af2ae2a7 100644 --- a/src/search.h +++ b/src/search.h @@ -96,13 +96,6 @@ double uvg_cu_rd_cost_chroma( lcu_t *const lcu, const cu_loc_t * const); -void uvg_lcu_fill_trdepth( - lcu_t *lcu, - const cu_loc_t* const cu_loc, - uint8_t tr_depth, - enum uvg_tree_type - tree_type); - void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index b6071a8d..9a4ac572 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1811,7 +1811,6 @@ static void search_pu_inter( cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, cu_loc, MAX(1, depth), UVG_BOTH_T); uvg_inter_recon_cu(state, lcu, true, false, cu_loc); uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); @@ -2127,9 +2126,6 @@ void uvg_cu_cost_inter_rd2( const cu_loc_t* const cu_loc){ const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - int tr_depth = MAX(1, depth); - - uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, UVG_BOTH_T); const int x_px = SUB_SCU(cu_loc->x); const int y_px = SUB_SCU(cu_loc->y); @@ -2179,7 +2175,7 @@ void uvg_cu_cost_inter_rd2( 
state->encoder_control->cfg.chroma_trskip_enable; double chroma_cost = 0; - if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && cur_cu->depth == cur_cu->tr_depth && reconstruct_chroma) { + if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && PU_IS_TU(cur_cu) && reconstruct_chroma) { uvg_quantize_lcu_residual(state, true, false, @@ -2262,7 +2258,7 @@ void uvg_cu_cost_inter_rd2( if(cbf) { *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0); if (reconstruct_chroma) { - if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { + if (!PU_IS_TU(cur_cu) || !state->encoder_control->cfg.jccr) { *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc); } else { diff --git a/src/search_intra.c b/src/search_intra.c index 61da97ef..c90a2a5e 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -266,7 +266,6 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, static double search_intra_trdepth( encoder_state_t * const state, const cu_loc_t* const cu_loc, - int max_depth, double cost_treshold, intra_search_data_t *const search_data, lcu_t *const lcu, @@ -296,8 +295,6 @@ static double search_intra_trdepth( double nosplit_cost = INT32_MAX; if (width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH) { - tr_cu->tr_depth = depth; - pred_cu->tr_depth = depth; const bool mts_enabled = (state->encoder_control->cfg.mts == UVG_MTS_INTRA || state->encoder_control->cfg.mts == UVG_MTS_BOTH) && PU_IS_TU(pred_cu); @@ -575,7 +572,7 @@ static double search_intra_trdepth( // - Maximum transform hierarchy depth is constrained by clipping // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). 
- if (depth < max_depth && depth < MAX_PU_DEPTH) { + else { cu_loc_t split_cu_loc; const int half_width = width / 2; @@ -583,28 +580,24 @@ static double search_intra_trdepth( split_cost = 0; uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); + split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); if (split_cost < nosplit_cost) { uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); + split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); } if (split_cost < nosplit_cost) { uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); + split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); } if (split_cost < nosplit_cost) { uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, max_depth, nosplit_cost, search_data, lcu, tree_type); + split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); } - - } else { - assert(width <= TR_MAX_WIDTH); } if (depth == 0 || split_cost < nosplit_cost) { return split_cost; } else { - uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); return nosplit_cost; } } @@ -1314,7 +1307,6 @@ static int8_t search_intra_rdo( const cu_loc_t* const cu_loc) { const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); const int width 
= cu_loc->width; const int height = cu_loc->height; // TODO: height for non-square blocks @@ -1338,7 +1330,7 @@ static int8_t search_intra_rdo( search_data[mode].bits = rdo_bitcost; search_data[mode].cost = rdo_bitcost * state->lambda; - double mode_cost = search_intra_trdepth(state, cu_loc, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); + double mode_cost = search_intra_trdepth(state, cu_loc, MAX_INT, &search_data[mode], lcu, tree_type); best_mts_mode_for_isp[isp_mode] = search_data[mode].pred_cu.tr_idx; best_lfnst_mode_for_isp[isp_mode] = search_data[mode].pred_cu.lfnst_idx; search_data[mode].cost += mode_cost; @@ -1492,7 +1484,7 @@ int8_t uvg_search_intra_chroma_rdo( } pred_cu->cr_lfnst_idx = lfnst; chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->lambda; - if (pred_cu->tr_depth == pred_cu->depth) { + if (PU_IS_TU(pred_cu)) { uvg_intra_predict( state, &refs[COLOR_U - 1], @@ -1898,12 +1890,6 @@ void uvg_search_cu_intra( number_of_modes += num_mip_modes; } - - // Set transform depth to current depth, meaning no transform splits. - { - const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type); - } // Refine results with slower search or get some results if rough search was skipped. 
const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 2fc27872..8c7b1c36 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -707,7 +707,6 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index bfb92700..81927486 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -315,7 +315,6 @@ int uvg_quant_cbcr_residual_generic( if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? 
COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx); } @@ -496,7 +495,6 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { diff --git a/src/transform.c b/src/transform.c index 1886d91b..83de133b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1393,7 +1393,7 @@ void uvg_quantize_lcu_residual( cbf_clear(&cur_pu->cbf, COLOR_V); } - if (depth == 0 || cur_pu->tr_depth > depth) { + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { // Split transform and increase depth const int offset = width / 2; @@ -1420,13 +1420,12 @@ void uvg_quantize_lcu_residual( LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, }; - - if (depth <= MAX_DEPTH) { - cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) + + cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) || cbf_is_set_any(child_cbfs[0]) || cbf_is_set_any(child_cbfs[1]) || cbf_is_set_any(child_cbfs[2]); - } + } else { // Process a leaf TU. 
@@ -1440,10 +1439,10 @@ void uvg_quantize_lcu_residual( quantize_tr_residual(state, COLOR_U, &loc, depth, cur_pu, lcu, early_skip, tree_type); quantize_tr_residual(state, COLOR_V, &loc, depth, cur_pu, lcu, early_skip, tree_type); } - if (jccr && cur_pu->tr_depth == cur_pu->depth) { + if (jccr && PU_IS_TU(cur_pu)) { quantize_tr_residual(state, COLOR_UV, &loc, depth, cur_pu, lcu, early_skip, tree_type); } - if(chroma && jccr && cur_pu->tr_depth == cur_pu->depth) { + if(chroma && jccr && PU_IS_TU(cur_pu)) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); } } diff --git a/src/uvg266.h b/src/uvg266.h index 4ab7ec1f..e3d8c0f9 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -338,7 +338,6 @@ typedef struct uvg_config int32_t trskip_max_size; /*!< \brief Transform skip max block size. */ enum uvg_mts mts; /*< \brief flag to enable multiple transform selection*/ int32_t mts_implicit; /*< \brief flag to enable implicit multiple transform selection*/ - int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */ enum uvg_ime_algorithm ime_algorithm; /*!< \brief Integer motion estimation algorithm. */ int32_t fme_level; /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled). 
*/ int8_t source_scan_type; /*!< \brief Source scan type (0: progressive, 1: top field first, 2: bottom field first).*/ From cfc6aebe3c4b4b8a497d62e28363be71827bdb9c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Sep 2022 15:13:48 +0300 Subject: [PATCH 099/254] [mtt] Remove depth from cu_info_t --- src/cli.c | 2 -- src/cu.h | 8 ++---- src/encode_coding_tree.c | 61 ++++++++++++++++++++-------------------- src/encoderstate.c | 5 ++-- src/filter.c | 26 ----------------- src/intra.c | 7 +++-- src/search.c | 19 ++++++------- src/search.h | 2 -- src/search_inter.c | 16 +++++------ src/search_intra.c | 5 +--- src/transform.c | 44 +++++++++-------------------- src/transform.h | 1 - 12 files changed, 71 insertions(+), 125 deletions(-) diff --git a/src/cli.c b/src/cli.c index 073fd12e..e831e4ed 100644 --- a/src/cli.c +++ b/src/cli.c @@ -76,7 +76,6 @@ static const struct option long_options[] = { { "tr-skip-max-size", required_argument, NULL, 0 }, { "mts", required_argument, NULL, 0 }, { "no-mts", no_argument, NULL, 0 }, - { "tr-depth-intra", required_argument, NULL, 0 }, { "me", required_argument, NULL, 0 }, { "subme", required_argument, NULL, 0 }, { "source-scan-type", required_argument, NULL, 0 }, @@ -636,7 +635,6 @@ void print_help(void) " This is mostly for debugging and is not\n" " guaranteed to produce sensible bitstream or\n" " work at all. 
[disabled]\n" - " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" " - off: Don't terminate early.\n" diff --git a/src/cu.h b/src/cu.h index 4a2e416c..e05df7de 100644 --- a/src/cu.h +++ b/src/cu.h @@ -118,7 +118,6 @@ typedef struct { typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values - uint8_t depth : 3; //!< \brief depth / size of this block uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index @@ -199,7 +198,7 @@ void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); } \ } while (0) -#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d coded=%d " \ +#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d part_size=%d coded=%d " \ "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \ "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \ "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \ @@ -207,7 +206,7 @@ void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \ "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \ "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \ - , (cu).type, (cu).depth, (cu).part_size, (cu).coded, \ + , (cu).type, (cu).part_size, (cu).coded, \ (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \ (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \ (cu).intra[1].cost, (cu).intra[1].bitcost, 
(cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \ @@ -568,7 +567,7 @@ static INLINE void cbf_set(uint16_t *cbf, color_t plane) * Set CBF in a level to true if it is set at a lower level in any of * the child_cbfs. */ -static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane) +static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], color_t plane) { bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) || cbf_is_set(child_cbfs[1], plane) || @@ -579,7 +578,6 @@ static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], } /** - * Set CBF in a levels <= depth to false. */ static INLINE void cbf_clear(uint16_t *cbf, color_t plane) { diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 1c7af8d7..dbcf9abe 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -444,7 +444,6 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, const cu_loc_t * const cu_loc, - int depth, cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, @@ -453,11 +452,11 @@ static void encode_chroma_tu( uvg_tree_type tree_type) { int width_c = cu_loc->chroma_width; - //int height_c = cu_loc->chroma_height; + int height_c = cu_loc->chroma_height; int x_local = ((cu_loc->x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; int y_local = ((cu_loc->y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; - *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); + *scan_idx = SCAN_DIAG; if(!joint_chroma){ // const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; // const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; @@ -468,7 +467,9 @@ static void encode_chroma_tu( if (cbf_is_set(cur_pu->cbf, COLOR_U)) { // TODO: height for this check and the others below - 
if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ + if(state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; // HEVC only supports transform_skip for Luma // TODO: transform skip for chroma blocks @@ -478,7 +479,9 @@ static void encode_chroma_tu( } if (cbf_is_set(cur_pu->cbf, COLOR_V)) { - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); } @@ -488,7 +491,9 @@ static void encode_chroma_tu( else { coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } @@ -500,15 +505,12 @@ static void encode_chroma_tu( static void encode_transform_unit( encoder_state_t * const state, const cu_loc_t *cu_loc, - int depth, bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, bool last_split, const cu_loc_t *original_loc) // Original cu dimensions, before CU split { - assert(depth >= 1 && depth <= MAX_PU_DEPTH); - const videoframe_t * const frame 
= state->tile->frame; cabac_data_t* const cabac = &state->cabac; const int x = cu_loc->x; @@ -524,7 +526,7 @@ static void encode_transform_unit( uvg_get_isp_cu_arr_coords(&isp_x, &isp_y); const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); - int8_t scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; int cbf_y = cbf_is_set(cur_pu->cbf, COLOR_Y); @@ -559,7 +561,7 @@ static void encode_transform_unit( } bool joint_chroma = cur_pu->joint_cb_cr != 0; - if (depth == MAX_DEPTH) { + if (cur_pu->log2_height + cur_pu->log2_width < 6) { // For size 4x4 luma transform the corresponding chroma transforms are // also of size 4x4 covering 8x8 luma pixels. The residual is coded in // the last transform unit. @@ -578,7 +580,7 @@ static void encode_transform_unit( if ((chroma_cbf_set || joint_chroma) && last_split) { //Need to drop const to get lfnst constraints // Use original dimensions instead of ISP split dimensions - encode_chroma_tu(state, original_loc, depth, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + encode_chroma_tu(state, original_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); } } @@ -593,7 +595,6 @@ static void encode_transform_unit( static void encode_transform_coeff( encoder_state_t * const state, const cu_loc_t * cu_loc, - int8_t depth, bool only_chroma, lcu_coeff_t* coeff, enum uvg_tree_type tree_type, @@ -641,7 +642,7 @@ static void encode_transform_coeff( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - encode_transform_coeff(state, &loc, depth + 1, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); + encode_transform_coeff(state, &loc, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); } } return; @@ -650,7 +651,10 @@ static void encode_transform_coeff( // No chroma. // Not the last CU for area of 64 pixels cowered by more than one luma CU. 
// Not the last ISP Split - if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T && last_split) { + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (cur_pu->log2_height + cur_pu->log2_width >= 6 || only_chroma) + && tree_type != UVG_LUMA_T + && last_split) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); @@ -687,7 +691,7 @@ static void encode_transform_coeff( } if (cb_flag_y | cb_flag_u | cb_flag_v) { - if (state->must_code_qp_delta && (only_chroma || cb_flag_y || depth != 4) ) { + if (state->must_code_qp_delta && (only_chroma || cb_flag_y || cur_pu->log2_height + cur_pu->log2_width >= 6) ) { const int qp_pred = uvg_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); const int qp_delta = cur_cu->qp - qp_pred; // Possible deltaQP range depends on bit depth as stated in HEVC specification. @@ -714,7 +718,7 @@ static void encode_transform_coeff( ((cb_flag_u || cb_flag_v ) && cur_cu->type == CU_INTRA) || (cb_flag_u && cb_flag_v)) - && (depth != 4 || only_chroma || tree_type == UVG_CHROMA_T) + && (cur_pu->log2_height + cur_pu->log2_width >= 6 || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr && last_split ) { @@ -723,7 +727,7 @@ static void encode_transform_coeff( CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, cu_loc, depth, only_chroma, coeff, tree_type, last_split, original_loc); + encode_transform_unit(state, cu_loc, only_chroma, coeff, tree_type, last_split, original_loc); } } @@ -1194,6 +1198,7 @@ void uvg_encode_intra_luma_coding_unit( if (cabac->only_count && bits_out) *bits_out += bits; } + bool uvg_write_split_flag( const encoder_state_t * const state, cabac_data_t* cabac, @@ -1362,7 +1367,7 @@ void uvg_encode_coding_tree( } // When not in MAX_DEPTH, insert split flag and split the blocks 
if needed - if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) { + if (cu_width + cu_height > 8) { const int split_flag = uvg_write_split_flag( state, @@ -1490,11 +1495,7 @@ void uvg_encode_coding_tree( CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } } - - // part_mode - //encode_part_mode(state, cabac, cur_cu, depth); - - + #if ENABLE_PCM // Code IPCM block @@ -1577,7 +1578,7 @@ void uvg_encode_coding_tree( // Code (possible) coeffs to bitstream if (has_coeffs) { int luma_cbf_ctx = 0; - encode_transform_coeff(state, cu_loc, depth, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, cu_loc, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); } encode_mts_idx(state, cabac, cur_cu, cu_loc); @@ -1591,7 +1592,7 @@ void uvg_encode_coding_tree( const bool is_local_dual_tree = cu_height * cu_width < 64 && tree_type == UVG_BOTH_T; // Code chroma prediction mode. - if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4 && tree_type == UVG_BOTH_T) { + if (state->encoder_control->chroma_format != UVG_CSP_400 && cur_cu->log2_height + cur_cu->log2_width >= 6 && tree_type == UVG_BOTH_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); } int luma_cbf_ctx = 0; @@ -1611,7 +1612,7 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, depth, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, &split_loc, 0, coeff, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); can_skip_last_cbf &= luma_cbf_ctx == 2; } } @@ -1631,7 +1632,7 @@ void uvg_encode_coding_tree( tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, cu_loc, depth, 1, coeff, tree_type, true, false, &luma_cbf_ctx, 
cu_loc); + encode_transform_coeff(state, cu_loc, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, cu_loc); } @@ -1696,7 +1697,7 @@ double uvg_mock_encode_coding_unit( } // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (tree_type != UVG_CHROMA_T ? depth != MAX_DEPTH : depth != MAX_DEPTH - 1) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { uvg_write_split_flag( state, cabac, @@ -1771,7 +1772,7 @@ double uvg_mock_encode_coding_unit( if(tree_type != UVG_CHROMA_T) { uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } - if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); } } diff --git a/src/encoderstate.c b/src/encoderstate.c index 6557a75f..eb529b2b 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -2209,11 +2209,12 @@ int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth); + const int qg_width = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_width); + const int qg_height = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_height); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); - const int y_qg = y & ~(qg_width - 1); + const int y_qg = y & ~(qg_height - 1); if(x_qg == 0 && 
y_qg > 0 && y_qg % LCU_WIDTH == 0) { return uvg_cu_array_at_const(cua, x_qg, y_qg - 1)->qp; } diff --git a/src/filter.c b/src/filter.c index 10d135a1..2f0b6a1c 100644 --- a/src/filter.c +++ b/src/filter.c @@ -307,32 +307,6 @@ static bool is_pu_boundary(const encoder_state_t *const state, it for now, in case some other tool requires it. */ return false; - //const cu_info_t *const scu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x, y); - //// Get the containing CU. - //const int32_t cu_width = LCU_WIDTH >> scu->depth; - //const int32_t x_cu = x & ~(cu_width - 1); - //const int32_t y_cu = y & ~(cu_width - 1); - //const cu_info_t *const cu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu); - - //const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - //for (int i = 0; i < num_pu; i++) { - // if (dir == EDGE_HOR) { - // int y_pu = PU_GET_Y(cu->part_size, cu_width, y_cu, i); - // if (y_pu == y) { - // return true; - // } - - // } else { - // int x_pu = PU_GET_X(cu->part_size, cu_width, x_cu, i); - // if (x_pu == x) { - // return true; - // } - // } - //} - - //return false; } diff --git a/src/intra.c b/src/intra.c index c38544b5..9df9acf7 100644 --- a/src/intra.c +++ b/src/intra.c @@ -565,7 +565,7 @@ static void predict_cclm( y_extension >>= tree_type == UVG_CHROMA_T; const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu >> (tree_type == UVG_CHROMA_T)) - 4, y_extension); if (y_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; - if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; + if(x_scu == 32 && y_scu == 0 && pu->log2_width == 6) break; } for(int i = 0; i < height + available_left_below * 2; i++) { sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; @@ -1866,8 +1866,9 @@ void uvg_intra_recon_cu( // TODO: not necessary to call if only luma and ISP is on uvg_quantize_lcu_residual(state, has_luma, has_chroma && 
!(search_data->pred_cu.joint_cb_cr & 3), search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, - cu_loc, depth, cur_cu, lcu, - false, tree_type); + cu_loc, cur_cu, lcu, + false, + tree_type); } diff --git a/src/search.c b/src/search.c index 6852c961..8778d081 100644 --- a/src/search.c +++ b/src/search.c @@ -165,7 +165,6 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y); to->type = cu->type; - to->depth = cu->depth; to->qp = cu->qp; to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; @@ -457,7 +456,6 @@ double uvg_cu_rd_cost_chroma( } if (!skip_residual_coding) { - const int tr_depth = depth - pred_cu->depth; cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); cabac->cur_ctx = ctx; @@ -945,7 +943,6 @@ static double search_cu( cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth - cur_cu->depth = (split_tree.current_depth > MAX_DEPTH) ? 
MAX_DEPTH : split_tree.current_depth; cur_cu->type = CU_NOTSET; cur_cu->qp = state->qp; cur_cu->bdpcmMode = 0; @@ -1045,7 +1042,8 @@ static double search_cu( int8_t intra_mode = intra_search.pred_cu.intra.mode; // TODO: This heavily relies to square CUs - if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if ((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 && y % 8)) + && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; if(tree_type == UVG_CHROMA_T) { @@ -1129,7 +1127,7 @@ static double search_cu( bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if ((split_tree.current_depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if ((cur_cu->log2_height + cur_cu->log2_width < 6) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1140,7 +1138,7 @@ static double search_cu( recon_luma, recon_chroma); - if(split_tree.current_depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { + if(cur_cu->log2_height + cur_cu->log2_width < 6 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, &intra_search, cu_loc, @@ -1203,11 +1201,10 @@ static double search_cu( uvg_quantize_lcu_residual(state, true, has_chroma && !cur_cu->joint_cb_cr, cur_cu->joint_cb_cr, &loc, - depth, NULL, lcu, false, - tree_type); + tree_type); int cbf = cbf_is_set_any(cur_cu->cbf); @@ -1297,7 +1294,7 @@ static double search_cu( double split_bits = 0; - if (split_tree.current_depth < MAX_DEPTH) { + if (cur_cu->log2_height + cur_cu->log2_width > 
4) { state->search_cabac.update = 1; // Add cost of cu_split_flag. @@ -1375,7 +1372,7 @@ static double search_cu( cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. - if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + if (cu_d1->type == CU_INTRA && (cu_d1->log2_height + 1 == cur_cu->log2_height || cu_d1->log2_width + 1 == cur_cu->log2_width)) { cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); @@ -1452,7 +1449,7 @@ static double search_cu( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); } - } else if (depth >= 0 && depth < MAX_PU_DEPTH) { + } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. work_tree_copy_down(depth, work_tree, tree_type, cu_loc); diff --git a/src/search.h b/src/search.h index af2ae2a7..809a4635 100644 --- a/src/search.h +++ b/src/search.h @@ -96,7 +96,5 @@ double uvg_cu_rd_cost_chroma( lcu_t *const lcu, const cu_loc_t * const); -void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); -void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); #endif diff --git a/src/search_inter.c b/src/search_inter.c index 9a4ac572..353eda31 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1813,7 +1813,7 @@ static void search_pu_inter( cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, 
true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; @@ -1823,9 +1823,9 @@ static void search_pu_inter( uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - cu_loc, depth, cur_pu, lcu, + cu_loc, cur_pu, lcu, true, - UVG_BOTH_T); + UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; @@ -2181,11 +2181,10 @@ void uvg_cu_cost_inter_rd2( false, false, cu_loc, - depth, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C]; uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width); @@ -2246,11 +2245,10 @@ void uvg_cu_cost_inter_rd2( true, reconstruct_chroma, reconstruct_chroma && state->encoder_control->cfg.jccr, cu_loc, - depth, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); } int cbf = cbf_is_set_any(cur_cu->cbf); diff --git a/src/search_intra.c b/src/search_intra.c index c90a2a5e..d6f0653f 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -271,8 +271,6 @@ static double search_intra_trdepth( lcu_t *const lcu, enum uvg_tree_type tree_type) { - - const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const uint8_t width = cu_loc->width; const uint8_t height = cu_loc->height; // TODO: height for non-square blocks const uint8_t width_c = cu_loc->chroma_width; @@ -595,7 +593,7 @@ static double search_intra_trdepth( } } - if (depth == 0 || split_cost < nosplit_cost) { + if (!PU_IS_TU(pred_cu) || split_cost < nosplit_cost) { return split_cost; } else { return nosplit_cost; @@ -1306,7 +1304,6 @@ static int8_t search_intra_rdo( enum uvg_tree_type tree_type, const cu_loc_t* const cu_loc) { - const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const int width = cu_loc->width; const int height = cu_loc->height; // TODO: height for non-square blocks diff 
--git a/src/transform.c b/src/transform.c index 83de133b..5e76c82f 100644 --- a/src/transform.c +++ b/src/transform.c @@ -844,16 +844,13 @@ void uvg_fwd_lfnst( const uint16_t lfnst_index = lfnst_idx; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); bool is_wide_angle = false; // TODO: get wide angle mode when implemented - - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); + + const int scan_order = SCAN_DIAG; if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { @@ -981,16 +978,13 @@ void uvg_inv_lfnst( const uint32_t lfnst_index = lfnst_idx; int8_t intra_mode = (color == COLOR_Y) ? 
cur_cu->intra.mode : cur_cu->intra.mode_chroma; bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); bool is_wide_angle = false; // TODO: get wide angle mode when implemented - - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); + + const int scan_order = SCAN_DIAG; if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; @@ -1148,7 +1142,6 @@ static void quantize_tr_residual( encoder_state_t * const state, const color_t color, const cu_loc_t *cu_loc, - const uint8_t depth, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, @@ -1164,7 +1157,7 @@ static void quantize_tr_residual( // If luma is 4x4, do chroma for the 8x8 luma area when handling the top // left PU because the coordinates are correct. bool handled_elsewhere = color != COLOR_Y && - depth == MAX_DEPTH && + cur_pu->log2_width + cur_pu-> log2_height < 6&& (x % 4 != 0 || y % 4 != 0); if (handled_elsewhere) { return; @@ -1181,8 +1174,7 @@ static void quantize_tr_residual( const int8_t mode = (color == COLOR_Y) ? 
cur_pu->intra.mode : cur_pu->intra.mode_chroma; - const coeff_scan_order_t scan_idx = - uvg_get_scan_order(cur_pu->type, mode, depth); // Height does not affect this + const coeff_scan_order_t scan_idx = SCAN_DIAG; const int offset = lcu_px.x + lcu_px.y * lcu_width; //const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); @@ -1355,7 +1347,6 @@ void uvg_quantize_lcu_residual( const bool chroma, const bool jccr, const cu_loc_t * cu_loc, - const uint8_t depth, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, @@ -1402,17 +1393,10 @@ void uvg_quantize_lcu_residual( cu_loc_t loc; uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1); // jccr is currently not supported if transform is split - uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, early_skip, tree_type); + uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, NULL, lcu, early_skip, tree_type); } } - - //const int32_t x2 = x + offset; - //const int32_t y2 = y + offset; - - //uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip, tree_type); - //uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip, tree_type); - //uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip, tree_type); - //uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip, tree_type); + // Propagate coded block flags from child CUs to parent CU. 
uint16_t child_cbfs[3] = { @@ -1433,14 +1417,14 @@ void uvg_quantize_lcu_residual( uvg_cu_loc_ctor(&loc, x, y, width, height); if (luma) { - quantize_tr_residual(state, COLOR_Y, &loc, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } if (chroma) { - quantize_tr_residual(state, COLOR_U, &loc, depth, cur_pu, lcu, early_skip, tree_type); - quantize_tr_residual(state, COLOR_V, &loc, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); } if (jccr && PU_IS_TU(cur_pu)) { - quantize_tr_residual(state, COLOR_UV, &loc, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_UV, &loc, cur_pu, lcu, early_skip, tree_type); } if(chroma && jccr && PU_IS_TU(cur_pu)) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); diff --git a/src/transform.h b/src/transform.h index 50a3f7de..ebe31109 100644 --- a/src/transform.h +++ b/src/transform.h @@ -96,7 +96,6 @@ void uvg_quantize_lcu_residual( bool chroma, const bool jccr, const cu_loc_t* cu_loc, - uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu, bool early_skip, From 58c6af8c87ceeedd62e8e7b4d40aaa775f7c59e0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Sep 2022 08:44:49 +0300 Subject: [PATCH 100/254] [mtt] Add function for easily getting all split cu_locs --- src/cu.c | 39 +++++++++++++++++++++++++++++++++++++++ src/cu.h | 1 + src/search.c | 15 ++++++--------- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/src/cu.c b/src/cu.c index aedf341c..4ae74da0 100644 --- a/src/cu.c +++ b/src/cu.c @@ -315,3 +315,42 @@ void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) loc->chroma_width = MAX(width >> 1, 4); loc->chroma_height = MAX(height >> 1, 4); } + + +void uvg_get_split_locs(const cu_loc_t* const origin, enum 
split_type split, cu_loc_t out[4]) +{ + const int half_width = origin->width >> 1; + const int half_height = origin->height >> 1; + const int quarter_width = origin->width >> 2; + const int quarter_height = origin->height >> 2; + + switch (split) { + case NO_SPLIT: + assert(0 && "trying to get split from no split"); + break; + case QT_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height); + uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height); + break; + case BT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height); + break; + case BT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); + break; + case TT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height); + break; + case TT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height); + break; + } +} \ No newline at end of file diff --git a/src/cu.h b/src/cu.h index e05df7de..f9021b3c 100644 --- a/src/cu.h +++ b/src/cu.h @@ -185,6 +185,7 @@ typedef struct { void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); +void 
uvg_get_split_locs(const cu_loc_t* const origin, enum split_type split, cu_loc_t out[4]); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) diff --git a/src/search.c b/src/search.c index 8778d081..bcfd82bc 100644 --- a/src/search.c +++ b/src/search.c @@ -1336,22 +1336,19 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - cu_loc_t new_cu_loc; + cu_loc_t new_cu_loc[4]; + uvg_get_split_locs(cu_loc, QT_SPLIT, new_cu_loc); if (split_cost < cost) { - uvg_cu_loc_ctor(&new_cu_loc, x, y, half_cu, half_cu); - split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[0], work_tree, tree_type, new_split); } if (split_cost < cost) { - uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y, half_cu, half_cu); - split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[1], work_tree, tree_type, new_split); } if (split_cost < cost) { - uvg_cu_loc_ctor(&new_cu_loc, x, y + half_cu, half_cu, half_cu); - split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[2], work_tree, tree_type, new_split); } if (split_cost < cost) { - uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y + half_cu, half_cu, half_cu); - split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[3], work_tree, tree_type, new_split); } } else { split_cost = INT_MAX; From 274e71dff6952a0297a4f4a9d9196988d28770be Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 8 Nov 2022 13:01:27 +0200 Subject: [PATCH 101/254] [transform] Simplify chroma transform search a bit --- 
src/transform.c | 57 ++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/src/transform.c b/src/transform.c index 5e76c82f..fbd6afeb 100644 --- a/src/transform.c +++ b/src/transform.c @@ -415,14 +415,11 @@ static void generate_jccr_transforms( static void quantize_chroma( encoder_state_t* const state, - int depth, int8_t width, int8_t height, coeff_t u_coeff[5120], coeff_t v_coeff[2048], - enum uvg_chroma_transforms transforms[5], - const int trans_offset, - int i, + enum uvg_chroma_transforms transform, coeff_t u_quant_coeff[1024], coeff_t v_quant_coeff[1024], const coeff_scan_order_t scan_order, @@ -431,9 +428,9 @@ static void quantize_chroma( uint8_t lfnst_idx) { if (state->encoder_control->cfg.rdoq_enable && - (transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) + (transform != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { - uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, + uvg_rdoq(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? 
COLOR_U : COLOR_V, scan_order, CU_INTRA, 0, lfnst_idx); int j; @@ -444,25 +441,25 @@ static void quantize_chroma( } } - if (transforms[i] == DCT7_CHROMA) { + if (transform == DCT7_CHROMA) { uint16_t temp_cbf = 0; if (*u_has_coeffs)cbf_set(&temp_cbf, COLOR_U); - uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, + uvg_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, scan_order, CU_INTRA, temp_cbf, lfnst_idx); } } - else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) { - uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U, scan_order); - uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, scan_order); + else if (state->encoder_control->cfg.rdoq_enable && transform == CHROMA_TS) { + uvg_ts_rdoq(state, u_coeff, u_quant_coeff, width, height, COLOR_U, scan_order); + uvg_ts_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, scan_order); } else { - uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + uvg_quant(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? 
COLOR_U : COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); - if (!IS_JCCR_MODE(transforms[i])) { - uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + if (!IS_JCCR_MODE(transform)) { + uvg_quant(state, v_coeff, v_quant_coeff, width, height, COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); } } @@ -472,7 +469,7 @@ static void quantize_chroma( break; } } - if (!IS_JCCR_MODE(transforms[i])) { + if (!IS_JCCR_MODE(transform)) { for (int j = 0; j < width * height; ++j) { if (v_quant_coeff[j]) { *v_has_coeffs = 1; @@ -498,7 +495,7 @@ void uvg_chroma_transform_search( { ALIGNED(64) coeff_t u_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 5]; ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; - ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; + ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; // In case of JCCR the v channel does not have coefficients ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; const int width = cu_loc->chroma_width; const int height = cu_loc->chroma_height; @@ -553,22 +550,20 @@ void uvg_chroma_transform_search( int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; bool u_has_coeffs = false; bool v_has_coeffs = false; + bool is_jccr = IS_JCCR_MODE(transforms[i]); if(pred_cu->cr_lfnst_idx) { uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); - if (!IS_JCCR_MODE(transforms[i])) { + if (!is_jccr) { uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); } } quantize_chroma( state, - depth, width, height, - u_coeff, - v_coeff, - transforms, - trans_offset, - i, + &u_coeff[i * trans_offset], + &v_coeff[i * trans_offset], + transforms[i], u_quant_coeff, v_quant_coeff, SCAN_DIAG, @@ -580,13 +575,13 @@ void uvg_chroma_transform_search( if(pred_cu->type == CU_INTRA && transforms[i] != 
CHROMA_TS && (cu_loc->width == 4 || tree_type == UVG_CHROMA_T)) { bool constraints[2] = { false, false }; uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U); - if(!IS_JCCR_MODE(transforms[i])) { + if(!is_jccr) { uvg_derive_lfnst_constraints(pred_cu, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; } - if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue; + if (is_jccr && !u_has_coeffs) continue; if (u_has_coeffs) { uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, @@ -619,7 +614,7 @@ void uvg_chroma_transform_search( } - if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) { + if (v_has_coeffs && !is_jccr) { uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); @@ -638,7 +633,7 @@ void uvg_chroma_transform_search( v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]); } } - else if (u_has_coeffs && IS_JCCR_MODE(transforms[i])) { + else if (u_has_coeffs && is_jccr) { if (transforms[i] == JCCR_1) { for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + u_recon_resi[j]); @@ -706,7 +701,7 @@ void uvg_chroma_transform_search( COEFF_ORDER_LINEAR); u_bits += coeff_cost; } - if (cbf_v && !IS_JCCR_MODE(transforms[i])) { + if (cbf_v && !is_jccr) { if (can_use_tr_skip) { CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma, transforms[i] == CHROMA_TS, v_bits, "tr_skip_v" @@ -743,7 +738,7 @@ void uvg_chroma_transform_search( pred_cu->lfnst_last_scan_pos = false; pred_cu->violates_lfnst_constrained_chroma = false; } - if (!IS_JCCR_MODE(transforms[i])) { + if (!is_jccr) { double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda; double v_cost = UVG_CHROMA_MULT 
* ssd_v + v_bits * state->frame->lambda; if (u_cost < chorma_ts_out->best_u_cost) { From 924a93b60e27c4ee880d0bf381d9577cb54f4559 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 8 Nov 2022 14:52:36 +0200 Subject: [PATCH 102/254] [mtt] Only initialize higher depth ctus partially --- src/search.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/src/search.c b/src/search.c index bcfd82bc..77c8b1ff 100644 --- a/src/search.c +++ b/src/search.c @@ -75,6 +75,44 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu } } + +static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, enum uvg_tree_type tree_type) { + + const int y_limit = MAX((cu_loc->local_y + cu_loc->height), LCU_WIDTH) >> (tree_type == UVG_CHROMA_T); + const int x_limit = MAX(cu_loc->local_x + cu_loc->width, LCU_WIDTH) >> (tree_type == UVG_CHROMA_T); + + if (cu_loc->local_x == 0) { + to->left_ref = from->left_ref; + } + if (cu_loc->local_y == 0) { + to->top_ref = from->top_ref; + } + + to->ref.chroma_format = from->ref.chroma_format; + + if (tree_type != UVG_CHROMA_T) { + const int offset = cu_loc->local_x + cu_loc->local_y * LCU_WIDTH; + uvg_pixels_blit(&from->ref.y[offset], &to->ref.y[offset], cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); + } + + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + const int offset = cu_loc->local_x / 2 + cu_loc->local_y / 2 * LCU_WIDTH_C; + uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + } + + const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; + const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; + for (int y = y_start; y < y_limit; y 
+= SCU_WIDTH) { + const int temp = LCU_CU_OFFSET + ((x_start) >> 2) + ((y) >> 2) * LCU_T_CU_WIDTH; + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + + } + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } +} + static INLINE void copy_cu_pixels( lcu_t *from, lcu_t *to, @@ -942,18 +980,10 @@ static double search_cu( pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + memset(cur_cu, 0, sizeof(cu_info_t)); // Assign correct depth cur_cu->type = CU_NOTSET; cur_cu->qp = state->qp; - cur_cu->bdpcmMode = 0; - cur_cu->tr_idx = 0; - cur_cu->violates_mts_coeff_constraint = 0; - cur_cu->mts_last_scan_pos = 0; - cur_cu->violates_lfnst_constrained_luma = 0; - cur_cu->violates_lfnst_constrained_chroma = 0; - cur_cu->lfnst_last_scan_pos = 0; - cur_cu->lfnst_idx = 0; - cur_cu->joint_cb_cr = 0; cur_cu->split_tree = split_tree.split_tree; cur_cu->log2_width = uvg_g_convert_to_log2[cu_width]; cur_cu->log2_height = uvg_g_convert_to_log2[cu_height]; @@ -1336,6 +1366,7 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { + initialize_partial_work_tree(&work_tree[depth], &work_tree[depth + 1], cu_loc, tree_type); cu_loc_t new_cu_loc[4]; uvg_get_split_locs(cu_loc, QT_SPLIT, new_cu_loc); if (split_cost < cost) { @@ -1651,9 +1682,6 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // process. 
lcu_t work_tree[MAX_PU_DEPTH + 1]; init_lcu_t(state, x, y, &work_tree[0], hor_buf, ver_buf); - for (int depth = 1; depth <= MAX_PU_DEPTH; ++depth) { - work_tree[depth] = work_tree[0]; - } // If the ML depth prediction is enabled, // generate the depth prediction interval From 1cf1501542fd7e7f854e291225ae3d6039eecf6f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 9 Nov 2022 07:34:41 +0200 Subject: [PATCH 103/254] [mtt] fix --- src/search.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/search.c b/src/search.c index 77c8b1ff..3cbf9b00 100644 --- a/src/search.c +++ b/src/search.c @@ -89,6 +89,7 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu } to->ref.chroma_format = from->ref.chroma_format; + to->rec.chroma_format = from->rec.chroma_format; if (tree_type != UVG_CHROMA_T) { const int offset = cu_loc->local_x + cu_loc->local_y * LCU_WIDTH; From 239ee8830607f1071ed6b52e5f50ac2b147a5dbb Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 10 Nov 2022 08:37:03 +0200 Subject: [PATCH 104/254] [mtt] fix --- src/search.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 3cbf9b00..9850146d 100644 --- a/src/search.c +++ b/src/search.c @@ -78,8 +78,8 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, enum uvg_tree_type tree_type) { - const int y_limit = MAX((cu_loc->local_y + cu_loc->height), LCU_WIDTH) >> (tree_type == UVG_CHROMA_T); - const int x_limit = MAX(cu_loc->local_x + cu_loc->width, LCU_WIDTH) >> (tree_type == UVG_CHROMA_T); + const int y_limit = (LCU_WIDTH - cu_loc->local_y) >> (tree_type == UVG_CHROMA_T); + const int x_limit = (LCU_WIDTH - cu_loc->local_x) >> (tree_type == UVG_CHROMA_T); if (cu_loc->local_x == 0) { to->left_ref = from->left_ref; @@ -112,6 +112,12 @@ static INLINE void 
initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu for (int x = x_start; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); } + + for (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } } static INLINE void copy_cu_pixels( @@ -1413,6 +1419,9 @@ static double search_cu( cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; + if (cur_cu->intra.mode_chroma > 79) { + cur_cu->intra.mode_chroma = cur_cu->intra.mode; + } // Disable MRL in this case cur_cu->intra.multi_ref_idx = 0; From a1e7664db3ae2d22cf1c7ec44c252fa33beb13d8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 10 Nov 2022 12:51:24 +0200 Subject: [PATCH 105/254] [mtt] temporarily disable zero coeff rdo --- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index 9850146d..3eda8004 100644 --- a/src/search.c +++ b/src/search.c @@ -1228,7 +1228,7 @@ static double search_cu( const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); - if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable && false) { //Calculate cost for zero coeffs inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; From c590e5ec736ed10caa01ddd392b0b80da1931c57 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 11 Nov 2022 06:59:28 +0200 Subject: [PATCH 106/254] [mtt] also copy top right CU --- src/search.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/search.c b/src/search.c index 3eda8004..1a1e50ff 100644 --- a/src/search.c +++ b/src/search.c @@ -83,6 +83,7 
@@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu if (cu_loc->local_x == 0) { to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } if (cu_loc->local_y == 0) { to->top_ref = from->top_ref; From e931c096dba84d0186f9c8a9c4cf7e668238fc23 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 11 Nov 2022 08:38:37 +0200 Subject: [PATCH 107/254] [mtt] fix --- src/search.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/search.c b/src/search.c index 1a1e50ff..8acdd4b1 100644 --- a/src/search.c +++ b/src/search.c @@ -76,7 +76,7 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu } -static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, enum uvg_tree_type tree_type) { +static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, const enum uvg_tree_type tree_type) { const int y_limit = (LCU_WIDTH - cu_loc->local_y) >> (tree_type == UVG_CHROMA_T); const int x_limit = (LCU_WIDTH - cu_loc->local_x) >> (tree_type == UVG_CHROMA_T); @@ -87,6 +87,7 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu } if (cu_loc->local_y == 0) { to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } to->ref.chroma_format = from->ref.chroma_format; @@ -105,12 +106,11 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; - for (int y = y_start; y < y_limit; y += SCU_WIDTH) { - const int temp = LCU_CU_OFFSET + ((x_start) >> 2) + ((y) >> 2) * LCU_T_CU_WIDTH; + for (int y = y_start; y < (tree_type != UVG_CHROMA_T ? 
LCU_WIDTH : LCU_WIDTH_C); y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } - for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + for (int x = x_start; x < (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C); x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); } From 1668b65f3fda19d058bbce9d03d7db9f358c3490 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Nov 2022 08:28:32 +0200 Subject: [PATCH 108/254] [mtt] fix --- src/inter.c | 6 +++--- src/search.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/inter.c b/src/inter.c index be353506..d275f4ea 100644 --- a/src/inter.c +++ b/src/inter.c @@ -708,9 +708,9 @@ void uvg_inter_pred_pu( const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2; yuv_t lcu_adapter; lcu_adapter.size = cu_loc->width * cu_loc->height; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, + lcu_adapter.y = lcu->rec.y + offset_luma; + lcu_adapter.u = lcu->rec.u + offset_chroma; + lcu_adapter.v = lcu->rec.v + offset_chroma; inter_recon_unipred(state, ref, diff --git a/src/search.c b/src/search.c index 8acdd4b1..f74a0f75 100644 --- a/src/search.c +++ b/src/search.c @@ -545,11 +545,11 @@ double uvg_cu_rd_cost_chroma( uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } else { - coeff_bits += uvg_get_coeff_cost(state, 
lcu->coeff.joint_uv, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } } From cf5f7658a0ec803124f0d8328e8b11c233abcbe4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Nov 2022 09:54:11 +0200 Subject: [PATCH 109/254] [mtt] fix --- src/search.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/search.c b/src/search.c index f74a0f75..9d9e7d14 100644 --- a/src/search.c +++ b/src/search.c @@ -78,8 +78,8 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, const enum uvg_tree_type tree_type) { - const int y_limit = (LCU_WIDTH - cu_loc->local_y) >> (tree_type == UVG_CHROMA_T); - const int x_limit = (LCU_WIDTH - cu_loc->local_x) >> (tree_type == UVG_CHROMA_T); + const int y_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); + const int x_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); if (cu_loc->local_x == 0) { to->left_ref = from->left_ref; @@ -106,11 +106,11 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; - for (int y = y_start; y < (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C); y += SCU_WIDTH) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } - for (int x = x_start; x < (tree_type != UVG_CHROMA_T ? 
LCU_WIDTH : LCU_WIDTH_C); x += SCU_WIDTH) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); } From 536c0ff2ef7dc5cacbfe43978bdf1df1959c6141 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Nov 2022 10:16:25 +0200 Subject: [PATCH 110/254] [quant] fix fast coeff cost --- src/rdo.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index e3b3bff6..d3ab8aef 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -301,35 +301,17 @@ static INLINE double get_coeff_cabac_cost( color_t color, int8_t scan_mode, int8_t tr_skip, - cu_info_t* cur_tu, - int coeff_order) + cu_info_t* cur_tu) { const int width = cu_loc->width; const int height = cu_loc->height; const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int lcu_width = color == COLOR_Y ? 
LCU_WIDTH : LCU_WIDTH_C; - - int x_local = cu_loc->x % LCU_WIDTH; - int y_local = cu_loc->y % LCU_WIDTH; // Make sure there are coeffs present bool found = false; - - coeff_t* coeff_ptr = NULL; - coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - - if (coeff_order == COEFF_ORDER_LINEAR) { - coeff_ptr = (coeff_t*)coeff; - } - else { - // Coeff order CU - uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); - coeff_ptr = sub_coeff; - } - for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) { - if (coeff_ptr[i] != 0) { + if (coeff[i] != 0) { found = 1; break; } @@ -352,7 +334,7 @@ static INLINE double get_coeff_cabac_cost( if(!tr_skip) { uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, - coeff_ptr, + coeff, cu_loc, color, scan_mode, @@ -362,7 +344,7 @@ static INLINE double get_coeff_cabac_cost( else { uvg_encode_ts_residual((encoder_state_t* const)state, &cabac_copy, - coeff_ptr, + coeff, width, height, color, @@ -426,6 +408,24 @@ double uvg_get_coeff_cost( const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int x_local = cu_loc->x % LCU_WIDTH; + int y_local = cu_loc->y % LCU_WIDTH; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int lcu_width = color == COLOR_Y ? 
LCU_WIDTH : LCU_WIDTH_C; + + + const coeff_t* coeff_ptr = NULL; + coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + if (coeff_order == COEFF_ORDER_LINEAR) { + coeff_ptr = coeff; + } + else { + // Coeff order CU + uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + coeff_ptr = sub_coeff; + } if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit && state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) { @@ -437,15 +437,15 @@ double uvg_get_coeff_cost( return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0) } else { uint64_t weights = uvg_fast_coeff_get_weights(state); - uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, height, weights); + uint32_t fast_cost = uvg_fast_coeff_cost(coeff_ptr, width, height, weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu, coeff_order); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, cu_loc, color, scan_mode, tr_skip, cur_tu, coeff_order); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { save_ccc(state->qp, coeff, width * width, ccc); } From 03b91992a3bfb889f9212eba596571c310774aa8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Nov 2022 11:24:43 +0200 Subject: [PATCH 111/254] [mtt] fix dual tree --- src/encode_coding_tree.c | 4 ++-- src/search.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index dbcf9abe..67676dda 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -561,7 +561,7 @@ static void encode_transform_unit( } bool joint_chroma = cur_pu->joint_cb_cr != 0; - if (cur_pu->log2_height + cur_pu->log2_width < 6) 
{ + if (cur_pu->log2_height + cur_pu->log2_width < 6 && tree_type != UVG_CHROMA_T) { // For size 4x4 luma transform the corresponding chroma transforms are // also of size 4x4 covering 8x8 luma pixels. The residual is coded in // the last transform unit. @@ -1772,7 +1772,7 @@ double uvg_mock_encode_coding_unit( if(tree_type != UVG_CHROMA_T) { uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } - if((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 != 0 && y % 8 != 0) || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); } } diff --git a/src/search.c b/src/search.c index 9d9e7d14..dd78a69d 100644 --- a/src/search.c +++ b/src/search.c @@ -1080,7 +1080,7 @@ static double search_cu( int8_t intra_mode = intra_search.pred_cu.intra.mode; // TODO: This heavily relies to square CUs - if ((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 && y % 8)) + if ((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 && y % 8) || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; @@ -1176,7 +1176,8 @@ static double search_cu( recon_luma, recon_chroma); - if(cur_cu->log2_height + cur_cu->log2_width < 6 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { + if((cur_cu->log2_height + cur_cu->log2_width < 6 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) + || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, &intra_search, cu_loc, From 
f2abdd64247cbb0fd95e5fc19a2d770d4a44b8a8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 15 Nov 2022 07:48:02 +0200 Subject: [PATCH 112/254] [mtt] Remove work_tree_copy_down and change work_tree_copy_up not to require the whole work tree as input parameter --- src/search.c | 58 +++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/src/search.c b/src/search.c index dd78a69d..1fe41b8e 100644 --- a/src/search.c +++ b/src/search.c @@ -85,10 +85,35 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu to->left_ref = from->left_ref; *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } + else { + if(tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(from->rec.y, to->rec.y, cu_loc->local_x, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + } + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(from->rec.u, to->rec.u, cu_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(from->rec.v, to->rec.v, cu_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + } + } + if (cu_loc->local_y == 0) { to->top_ref = from->top_ref; *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } + else { + if (tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(&from->rec.y[cu_loc->local_x], &to->rec.y[cu_loc->local_x], + LCU_WIDTH - cu_loc->local_x, cu_loc->local_y, + LCU_WIDTH, LCU_WIDTH); + } + if (tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(&from->rec.u[cu_loc->local_x / 2], &to->rec.u[cu_loc->local_x / 2], + LCU_WIDTH_C - cu_loc->local_x / 2, cu_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->rec.v[cu_loc->local_x / 2], &to->rec.v[cu_loc->local_x / 2], + LCU_WIDTH_C - cu_loc->local_x / 2, cu_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + } + } to->ref.chroma_format = from->ref.chroma_format; to->rec.chroma_format = from->rec.chroma_format; @@ -174,36 +199,19 @@ static INLINE 
void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to * Copy all non-reference CU data from next level to current level. */ static void work_tree_copy_up( - lcu_t *work_tree, + lcu_t *from, + lcu_t* to, bool joint, enum uvg_tree_type tree_type, - const cu_loc_t* const cu_loc, - const int depth) -{ - copy_cu_info (&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type); - copy_cu_pixels(&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type); - copy_cu_coeffs(cu_loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); - -} - - -/** - * Copy all non-reference CU data from current level to all lower levels. - */ -static void work_tree_copy_down( - int depth, - lcu_t *work_tree, - enum uvg_tree_type - tree_type, const cu_loc_t* const cu_loc) { - for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) { - copy_cu_info (&work_tree[depth], &work_tree[i], cu_loc, tree_type); - copy_cu_pixels(&work_tree[depth], &work_tree[i], cu_loc, tree_type); - } + copy_cu_info (from, to, cu_loc, tree_type); + copy_cu_pixels(from, to, cu_loc, tree_type); + copy_cu_coeffs(cu_loc, from, to, joint, tree_type); } + static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, int height, const cu_info_t *cu) { // Set mode in every CU covered by part_mode in this depth. @@ -1457,7 +1465,7 @@ static double search_cu( if (split_cost < cost) { // Copy split modes to this depth. cost = split_cost; - work_tree_copy_up(work_tree, state->encoder_control->cfg.jccr, tree_type, cu_loc, depth); + work_tree_copy_up(&work_tree[depth + 1], &work_tree[depth], state->encoder_control->cfg.jccr, tree_type, cu_loc); #if UVG_DEBUG //debug_split = 1; #endif @@ -1465,7 +1473,6 @@ static double search_cu( // Copy this CU's mode all the way down for use in adjacent CUs mode // search. 
memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); - work_tree_copy_down(depth, work_tree, tree_type, cu_loc); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); @@ -1492,7 +1499,6 @@ static double search_cu( } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. - work_tree_copy_down(depth, work_tree, tree_type, cu_loc); if(tree_type != UVG_CHROMA_T) { downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] From 02a5adf76820287e3f2d8e0fb7b9a5ddba69ecf1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 15 Nov 2022 08:35:47 +0200 Subject: [PATCH 113/254] [mtt] remove work_tree --- src/search.c | 74 +++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/search.c b/src/search.c index 1fe41b8e..2eead001 100644 --- a/src/search.c +++ b/src/search.c @@ -115,6 +115,11 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu } } + if (tree_type == UVG_CHROMA_T) { + // These are needed for CCLM + uvg_pixels_blit(from->rec.y, to->rec.y, MIN(cu_loc->local_x + cu_loc->width * 2, LCU_WIDTH), MIN(cu_loc->local_y + cu_loc->height * 2, LCU_WIDTH), LCU_WIDTH, LCU_WIDTH); + } + to->ref.chroma_format = from->ref.chroma_format; to->rec.chroma_format = from->rec.chroma_format; @@ -925,7 +930,7 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) static double search_cu( encoder_state_t* const state, const cu_loc_t* const cu_loc, - lcu_t* work_tree, + lcu_t* lcu, enum uvg_tree_type tree_type, const split_tree_t split_tree) @@ -962,9 +967,7 @@ static double search_cu( int32_t min; int32_t max; } pu_depth_inter, pu_depth_intra; - - lcu_t *const lcu = &work_tree[split_tree.current_depth]; - + int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); int y_local = 
SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); @@ -1240,7 +1243,7 @@ static double search_cu( if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable && false) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; + // inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; } cu_loc_t loc; @@ -1296,21 +1299,21 @@ static double search_cu( cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc); - if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { - cost = inter_zero_coeff_cost; + //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { + // cost = inter_zero_coeff_cost; - // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type); + // // Restore saved pixels from lower level of the working tree. + // copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type); - if (cur_cu->merged) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - } + // if (cur_cu->merged) { + // cur_cu->merged = 0; + // cur_cu->skipped = 1; + // lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + // } - cur_cu->cbf = 0; - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); - } + // cur_cu->cbf = 0; + // lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + //} cabac->update = 0; } @@ -1340,6 +1343,7 @@ static double search_cu( double split_bits = 0; + lcu_t split_lcu; if (cur_cu->log2_height + cur_cu->log2_width > 4) { @@ -1383,20 +1387,20 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. 
if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - initialize_partial_work_tree(&work_tree[depth], &work_tree[depth + 1], cu_loc, tree_type); + initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); cu_loc_t new_cu_loc[4]; uvg_get_split_locs(cu_loc, QT_SPLIT, new_cu_loc); if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[0], work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[0], &split_lcu, tree_type, new_split); } if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[1], work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[1], &split_lcu, tree_type, new_split); } if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[2], work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[2], &split_lcu, tree_type, new_split); } if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[3], work_tree, tree_type, new_split); + split_cost += search_cu(state, &new_cu_loc[3], &split_lcu, tree_type, new_split); } } else { split_cost = INT_MAX; @@ -1414,7 +1418,7 @@ static double search_cu( && tree_type == UVG_BOTH_T) { - cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&split_lcu, x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && (cu_d1->log2_height + 1 == cur_cu->log2_height || cu_d1->log2_width + 1 == cur_cu->log2_width)) { @@ -1465,7 +1469,7 @@ static double search_cu( if (split_cost < cost) { // Copy split modes to this depth. 
cost = split_cost; - work_tree_copy_up(&work_tree[depth + 1], &work_tree[depth], state->encoder_control->cfg.jccr, tree_type, cu_loc); + work_tree_copy_up(&split_lcu, lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc); #if UVG_DEBUG //debug_split = 1; #endif @@ -1698,15 +1702,15 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // will use these as temporary storage for predictions before making // a decision on which to use, and they get updated during the search // process. - lcu_t work_tree[MAX_PU_DEPTH + 1]; - init_lcu_t(state, x, y, &work_tree[0], hor_buf, ver_buf); + lcu_t work_tree; + init_lcu_t(state, x, y, &work_tree, hor_buf, ver_buf); // If the ML depth prediction is enabled, // generate the depth prediction interval // for the current lcu constraint_t* constr = state->constraint; if (constr->ml_intra_depth_ctu) { - uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree[0].ref.y, state->qp); + uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree.ref.y, state->qp); } int tree_type = state->frame->slicetype == UVG_SLICE_I @@ -1719,7 +1723,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con double cost = search_cu( state, &start, - work_tree, + &work_tree, tree_type, split_tree); @@ -1730,26 +1734,26 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // The best decisions through out the LCU got propagated back to depth 0, // so copy those back to the frame. - copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); + copy_lcu_to_cu_data(state, x, y, &work_tree, tree_type); // Copy coeffs to encoder state. 
- copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + copy_coeffs(work_tree.coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( state, &start, - work_tree, + &work_tree, UVG_CHROMA_T, split_tree); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; } - copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); + copy_lcu_to_cu_data(state, x, y, &work_tree, UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); } } From bbbd391b9e3f767d4a6b3bc1c44f36960e41d439 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 15 Nov 2022 15:31:44 +0200 Subject: [PATCH 114/254] [mtt] WIP --- src/cu.c | 16 +-- src/cu.h | 5 +- src/encode_coding_tree.c | 34 ++----- src/encoder_state-bitstream.c | 51 ++++------ src/global.h | 2 +- src/intra.c | 4 +- src/rdo.c | 7 +- src/search.c | 53 +++++----- src/search_inter.c | 4 +- src/search_intra.c | 97 ++++++++++-------- src/strategies/avx2/quant-avx2.c | 4 +- src/strategies/generic/quant-generic.c | 12 +-- src/strategies/strategies-picture.c | 133 ++++++++++++++----------- src/strategies/strategies-picture.h | 4 +- src/transform.c | 2 +- src/uvg266.h | 9 ++ tools/generate_tables.c | 2 +- 17 files changed, 225 insertions(+), 214 deletions(-) diff --git a/src/cu.c b/src/cu.c index 
4ae74da0..0256bd3d 100644 --- a/src/cu.c +++ b/src/cu.c @@ -317,7 +317,10 @@ void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) } -void uvg_get_split_locs(const cu_loc_t* const origin, enum split_type split, cu_loc_t out[4]) +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4]) { const int half_width = origin->width >> 1; const int half_height = origin->height >> 1; @@ -333,24 +336,25 @@ void uvg_get_split_locs(const cu_loc_t* const origin, enum split_type split, cu_ uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height); uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height); uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height); - break; + return 4; case BT_HOR_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height); uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height); - break; + return 2; case BT_VER_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); - break; + return 2; case TT_HOR_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height); uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height); - break; + return 3; case TT_VER_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height); - break; + return 3; } + return 0; } \ No newline at end of file diff --git a/src/cu.h b/src/cu.h index f9021b3c..7f1bd0e3 100644 --- 
a/src/cu.h +++ b/src/cu.h @@ -185,7 +185,10 @@ typedef struct { void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); -void uvg_get_split_locs(const cu_loc_t* const origin, enum split_type split, cu_loc_t out[4]); +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4]); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 67676dda..4468390c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -543,7 +543,7 @@ static void encode_transform_unit( if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0); + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, height, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0); } if(cur_pu->tr_idx == MTS_SKIP) { uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL); @@ -1040,7 +1040,7 @@ void uvg_encode_intra_luma_coding_unit( int multi_ref_idx = enable_mrl ? 
cur_cu->intra.multi_ref_idx : 0; #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, width, multi_ref_idx); + if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, height, multi_ref_idx); #endif if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) { @@ -1222,7 +1222,7 @@ bool uvg_write_split_flag( no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; if (depth > MAX_DEPTH) allow_qt = false; // ToDo: update this when btt is actually used - bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH + bool allow_btt = true;// when mt_depth < MAX_BT_DEPTH const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; @@ -1289,7 +1289,7 @@ bool uvg_write_split_flag( CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != 0, bits, "split_flag"); } - bool qt_split = split_flag == UVG_QUAD_SPLIT; + bool qt_split = split_flag == QT_SPLIT; if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 
0 : 3); @@ -1384,28 +1384,16 @@ void uvg_encode_coding_tree( const int half_luma = cu_loc->width / 2; split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1 }; - cu_loc_t new_cu_loc; - uvg_cu_loc_ctor(&new_cu_loc, x, y, half_luma, half_luma); - // Split blocks and remember to change x and y block positions - uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); - - if (!border_x || border_split_x) { - uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y, half_luma, half_luma); - uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); - } - if (!border_y || border_split_y) { - uvg_cu_loc_ctor(&new_cu_loc, x, y + half_cu, half_luma, half_luma); - uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); - } - if (!border || (border_split_x && border_split_y)) { - uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y + half_cu, half_luma, half_luma); - uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc, new_split_tree); + cu_loc_t new_cu_loc[4]; + const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc); + for (int split = 0; split frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1); + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1457,8 +1445,8 @@ void uvg_encode_coding_tree( } } #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]); - if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); + if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[0][0], 
cur_cu->inter.mv[0][1]); + if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); #endif goto end; diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index ba0d32f6..1649d944 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -528,48 +528,31 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_UE(stream, MIN_SIZE-2, "log2_min_luma_coding_block_size_minus2"); // Min size 2^3 = 8x8 // if(!no_partition_constraints_override_constraint_flag) WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag"); - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_luma"); - + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); + WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth, "sps_max_mtt_hierarchy_depth_intra_slice_luma"); + if (encoder->cfg.max_intra_slice_btt_depth) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma"); + } + if (encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, encoder->cfg.dual_tree, 1, "qtbtt_dual_tree_intra_flag"); } if (encoder->cfg.dual_tree) { - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); - if (0 /*sps_max_mtt_hierarchy_depth_intra_slice_chroma != 0*/) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); - WRITE_UE(stream, 0, 
"sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); + WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth_chroma, "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); + if (encoder->cfg.max_intra_slice_btt_depth_chroma) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); } } - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_inter_slice"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_inter_slice"); - - -#if 0 // mtt depth intra - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_luma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice"); + WRITE_UE(stream, encoder->cfg.max_inter_slice_btt_depth, "sps_max_mtt_hierarchy_depth_inter_slice"); + if (encoder->cfg.max_inter_slice_btt_depth != 0) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group"); } -#endif -#if 0 // mtt depth inter - if (max_mtt_depth_inter != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_inter_tile_group"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_inter_tile_group"); - } -#endif -#if 0 // Dual Tree - if (encoder->cfg.dual_i_tree) { - WRITE_UE(stream, 0, 
"sps_log2_diff_min_qt_min_cb_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_tile_group_chroma"); - - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_chroma"); - } - } -#endif if (LCU_WIDTH > 32) WRITE_U(stream, (TR_MAX_LOG2_SIZE - 5) ? 1 : 0, 1, "sps_max_luma_transform_size_64_flag"); diff --git a/src/global.h b/src/global.h index e4a11b20..87ca92ee 100644 --- a/src/global.h +++ b/src/global.h @@ -129,7 +129,7 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -//#define UVG_DEBUG_PRINT_CABAC 1 +#define UVG_DEBUG_PRINT_CABAC 1 //#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 diff --git a/src/intra.c b/src/intra.c index 9df9acf7..99150ef2 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1552,7 +1552,7 @@ void uvg_intra_predict( } } else { - uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); + uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, height, stride / 2, width); if (!PU_IS_TU(&data->pred_cu) || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { predict_cclm( state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, @@ -1560,7 +1560,7 @@ void uvg_intra_predict( tree_type); } else { - linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); + linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 
0 : 1], dst, dst, width, height); } } } diff --git a/src/rdo.c b/src/rdo.c index d3ab8aef..26f31634 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -447,7 +447,7 @@ double uvg_get_coeff_cost( } else { double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { - save_ccc(state->qp, coeff, width * width, ccc); + save_ccc(state->qp, coeff, width * height, ccc); } return ccc; } @@ -1474,10 +1474,13 @@ void uvg_rdoq( // Hope the compiler is able to utilize this information. switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break; - default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); + default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups"); } cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]); diff --git a/src/search.c b/src/search.c index 2eead001..56f8f566 100644 --- a/src/search.c +++ b/src/search.c @@ -253,15 +253,16 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } -static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) +static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu) { - const uint32_t mask = ~((MIN(width, TR_MAX_WIDTH))-1); + const uint32_t x_mask = ~((MIN(width, TR_MAX_WIDTH))-1); + const uint32_t y_mask = ~((MIN(height, TR_MAX_WIDTH))-1); // Set coeff flags in every CU covered by part_mode in this depth. 
- for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) { + for (uint32_t y = y_local; y < y_local + height; y += SCU_WIDTH) { for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { // Use TU top-left CU to propagate coeff flags - cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask); + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & x_mask, y & y_mask); cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x, y); if (cu_from != cu_to) { // Chroma and luma coeff data is needed for deblocking @@ -943,6 +944,7 @@ static double search_cu( const int x = cu_loc->x; const int y = cu_loc->y; const int luma_width = cu_loc->width; + const int luma_height = cu_loc->height; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -1009,7 +1011,7 @@ static double search_cu( // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. - if ( x + luma_width <= frame_width && y + luma_width <= frame_height) + if ( x + luma_width <= frame_width && y + luma_height <= frame_height) { int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = @@ -1022,7 +1024,7 @@ static double search_cu( // otherwise forbid it. 
(x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_width || (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_height - ); + ) && cu_loc->width == cu_loc->height; // Don't allow non square inter CUs for now if (can_use_inter) { double mode_cost; @@ -1179,7 +1181,7 @@ static double search_cu( if ((cur_cu->log2_height + cur_cu->log2_width < 6) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); uvg_intra_recon_cu(state, &intra_search, cu_loc, NULL, lcu, @@ -1226,7 +1228,7 @@ static double search_cu( if(cbf_cr) cbf_set(&split_cu->cbf, COLOR_V); split_cu->joint_cb_cr = jccr; } - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); } else if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { @@ -1269,7 +1271,7 @@ static double search_cu( } } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu); } } @@ -1308,7 +1310,7 @@ static double search_cu( // if (cur_cu->merged) { // cur_cu->merged = 0; // cur_cu->skipped = 1; - // lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + // lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); // } // cur_cu->cbf = 0; @@ -1332,9 +1334,9 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { - const split_tree_t new_split = { split_tree.split_tree | QT_SPLIT << (split_tree.current_depth * 3), split_tree.current_depth + 1 }; - - int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); + const int split_type = depth == 0 ? 
QT_SPLIT : BT_HOR_SPLIT; + const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1 }; + double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf); cabac_data_t post_seach_cabac; @@ -1389,19 +1391,14 @@ static double search_cu( if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); cu_loc_t new_cu_loc[4]; - uvg_get_split_locs(cu_loc, QT_SPLIT, new_cu_loc); - if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[0], &split_lcu, tree_type, new_split); - } - if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[1], &split_lcu, tree_type, new_split); - } - if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[2], &split_lcu, tree_type, new_split); - } - if (split_cost < cost) { - split_cost += search_cu(state, &new_cu_loc[3], &split_lcu, tree_type, new_split); + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc); + for (int split = 0; split < splits; ++split) { + split_cost += search_cu(state, &new_cu_loc[split], &split_lcu, tree_type, new_split); + if (split_cost < cost) { + break; + } } + } else { split_cost = INT_MAX; } @@ -1442,7 +1439,7 @@ static double search_cu( cur_cu->lfnst_idx = 0; cur_cu->cr_lfnst_idx = 0; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); intra_search_data_t proxy; FILL(proxy, 0); @@ -1492,7 +1489,7 @@ static double search_cu( } // Add candidate when in inter slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } else { @@ -1520,7 +1517,7 @@ static double search_cu( } // Add candidate when in inter 
slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } diff --git a/src/search_inter.c b/src/search_inter.c index 353eda31..37adaf27 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2187,8 +2187,8 @@ void uvg_cu_cost_inter_rd2( UVG_BOTH_T); ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C]; - uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width); - uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, width, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, height, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, height, LCU_WIDTH_C, width); ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; diff --git a/src/search_intra.c b/src/search_intra.c index d6f0653f..d08b9d64 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -130,17 +130,31 @@ static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double * * * \return */ -static void get_cost_dual(encoder_state_t * const state, - const pred_buffer preds, const uvg_pixel *orig_block, - cost_pixel_nxn_multi_func *satd_twin_func, - cost_pixel_nxn_multi_func *sad_twin_func, - int width, double *costs_out) +static void get_cost_dual( + encoder_state_t * const state, + const pred_buffer preds, + const uvg_pixel *orig_block, + cost_pixel_nxn_multi_func *satd_twin_func, + cost_pixel_nxn_multi_func *sad_twin_func, + int width, + int height, + double *costs_out) { #define PARALLEL_BLKS 2 unsigned satd_costs[PARALLEL_BLKS] = { 0 }; - satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + if (satd_twin_func != NULL) { + satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + } else { + satd_costs[0] = 
uvg_satd_any_size(width, height, preds[0], width, orig_block, LCU_WIDTH); + satd_costs[1] = uvg_satd_any_size(width, height, preds[1], width, orig_block, LCU_WIDTH); + } unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 }; - sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + if (sad_twin_func != NULL) { + sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + } else { + unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, LCU_WIDTH); + unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, LCU_WIDTH); + } costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2); costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2); @@ -651,7 +665,7 @@ static int search_intra_chroma_rough( uvg_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig_u, orig_block, width, width, LCU_WIDTH_C, width); + uvg_pixels_blit(orig_u, orig_block, width, height, LCU_WIDTH_C, width); int modes_count = (state->encoder_control->cfg.cclm ? 
8 : 5); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; @@ -671,7 +685,7 @@ static int search_intra_chroma_rough( } } - uvg_pixels_blit(orig_v, orig_block, width, width, LCU_WIDTH_C, width); + uvg_pixels_blit(orig_v, orig_block, width, height, LCU_WIDTH_C, width); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; @@ -764,7 +778,7 @@ static int16_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -783,7 +797,7 @@ static int16_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. - cu_loc_t loc = { 0, 0, width, width, width, width }; + cu_loc_t loc = { 0, 0, width, height, width, height }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; @@ -963,19 +977,19 @@ static uint8_t search_intra_rough( uvg_pixel *orig, int32_t origstride, uvg_intra_references *refs, - int log2_width, + int width, + int height, int8_t *intra_preds, intra_search_data_t* modes_out, cu_info_t* const pred_cu, uint8_t mip_ctx) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? 
- assert(log2_width >= 2 && log2_width <= 5); - int_fast8_t width = 1 << log2_width; + assert(width >= 4 && width <= 32); // cost_pixel_nxn_func *satd_func = kvz_pixels_get_satd_func(width); // cost_pixel_nxn_func *sad_func = kvz_pixels_get_sad_func(width); - cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width); - cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width); + cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); bool mode_checked[UVG_NUM_INTRA_MODES] = {0}; double costs[UVG_NUM_INTRA_MODES]; @@ -990,7 +1004,7 @@ static uint8_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -1016,17 +1030,16 @@ static uint8_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. 
- cu_loc_t loc = { 0, 0, width, width, width, width }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); search_proxy.pred_cu.intra.mode = 1; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs); + uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs); mode_checked[0] = true; mode_checked[1] = true; costs[0] += count_bits( @@ -1075,12 +1088,12 @@ static uint8_t search_intra_rough( for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { search_proxy.pred_cu.intra.mode = mode + i*offset; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); } } //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { costs_out[i] += count_bits( @@ -1147,12 +1160,12 @@ static uint8_t search_intra_rough( for (int block = 0; block < PARALLEL_BLKS; ++block) { search_proxy.pred_cu.intra.mode = modes_to_check[block + i]; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); 
} //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int block = 0; block < PARALLEL_BLKS; ++block) { costs_out[block] += count_bits( state, @@ -1219,12 +1232,9 @@ static void get_rough_cost_for_2n_modes( const int height = cu_loc->height; cost_pixel_nxn_multi_func* satd_dual_func; cost_pixel_nxn_multi_func* sad_dual_func; - if (width == height) { - satd_dual_func = uvg_pixels_get_satd_dual_func(width); - sad_dual_func = uvg_pixels_get_sad_dual_func(width); - } else { - assert(false && "Joose promised to fix this."); - } + satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); + uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); @@ -1232,7 +1242,7 @@ static void get_rough_cost_for_2n_modes( uvg_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT]; uvg_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig, orig_block, width, width, orig_stride, width); + uvg_pixels_blit(orig, orig_block, width, height, orig_stride, width); const double mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0; const double not_mip = state->encoder_control->cfg.mip ? 
CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; @@ -1243,7 +1253,7 @@ static void get_rough_cost_for_2n_modes( for (int i = 0; i < PARALLEL_BLKS; ++i) { uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); } - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for(int i = 0; i < PARALLEL_BLKS; ++i) { uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx; @@ -1796,16 +1806,17 @@ void uvg_search_cu_intra( bool skip_rough_search = (is_large || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { num_regular_modes = number_of_modes = search_intra_rough( - state, - cu_loc, - ref_pixels, - LCU_WIDTH, - refs, - log2_width, - candidate_modes, - search_data, - &temp_pred_cu, - mip_ctx); + state, + cu_loc, + ref_pixels, + LCU_WIDTH, + refs, + cu_loc->width, + cu_loc->height, + candidate_modes, + search_data, + &temp_pred_cu, + mip_ctx); // if(lines == 1) sort_modes(search_data, number_of_modes); } else { diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 8c7b1c36..2d45166c 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -720,7 +720,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, } // Check if there are any non-zero coefficients. - for (int i = 0; i < width * width; i += 8) { + for (int i = 0; i < width * height; i += 8) { __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i])); has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff); if(has_coeffs) break; @@ -730,7 +730,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // rec_out. if (has_coeffs && !early_skip) { // Get quantized residual. 
(coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, color, + uvg_dequant(state, coeff_out, coeff, width, height, color, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y); if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 81927486..eed95e59 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -315,22 +315,22 @@ int uvg_quant_cbcr_residual_generic( if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { - uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx); } else if (state->encoder_control->cfg.rdoq_enable && false) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx); } int8_t has_coeffs = 0; { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; @@ -341,10 +341,10 @@ int uvg_quant_cbcr_residual_generic( if (has_coeffs && !early_skip) { // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? 
COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); if (cur_cu->cr_lfnst_idx) { - uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); } uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 00ad9ccb..37d3cb75 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -115,103 +115,116 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) { /** * \brief Get a function that calculates SATD for NxN block. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. * * \returns Pointer to cost_16bit_nxn_func. */ -cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4; - case 8: - return uvg_satd_8x8; - case 16: - return uvg_satd_16x16; - case 32: - return uvg_satd_32x32; - case 64: - return uvg_satd_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4; + case 8: + return uvg_satd_8x8; + case 16: + return uvg_satd_16x16; + case 32: + return uvg_satd_32x32; + case 64: + return uvg_satd_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SAD for NxN block. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_16bit_nxn_func. 
*/ -cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4; - case 8: - return uvg_sad_8x8; - case 16: - return uvg_sad_16x16; - case 32: - return uvg_sad_32x32; - case 64: - return uvg_sad_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4; + case 8: + return uvg_sad_8x8; + case 16: + return uvg_sad_16x16; + case 32: + return uvg_sad_32x32; + case 64: + return uvg_sad_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SATDs for 2 NxN blocks. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. +* \param height Height of the region for which SATD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. */ -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4_dual; - case 8: - return uvg_satd_8x8_dual; - case 16: - return uvg_satd_16x16_dual; - case 32: - return uvg_satd_32x32_dual; - case 64: - return uvg_satd_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4_dual; + case 8: + return uvg_satd_8x8_dual; + case 16: + return uvg_satd_16x16_dual; + case 32: + return uvg_satd_32x32_dual; + case 64: + return uvg_satd_64x64_dual; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SADs for 2 NxN blocks. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. 
*/ -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4_dual; - case 8: - return uvg_sad_8x8_dual; - case 16: - return uvg_sad_16x16_dual; - case 32: - return uvg_sad_32x32_dual; - case 64: - return uvg_sad_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4_dual; + case 8: + return uvg_sad_8x8_dual; + case 16: + return uvg_sad_16x16_dual; + case 32: + return uvg_sad_32x32_dual; + case 64: + return uvg_sad_64x64_dual; + default: + return NULL; + } } + return NULL; } // Precomputed CRC32C lookup table for polynomial 0x04C11DB7 diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 8d73f74c..286a0735 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -203,8 +203,8 @@ extern pixel_var_func *uvg_pixel_var; extern generate_residual_func* uvg_generate_residual; int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth); -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n); -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height); +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height); #define STRATEGIES_PICTURE_EXPORTS \ {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \ diff --git a/src/transform.c b/src/transform.c index fbd6afeb..0169a0ff 100644 --- a/src/transform.c +++ b/src/transform.c @@ -584,7 +584,7 @@ void uvg_chroma_transform_search( if (is_jccr && !u_has_coeffs) continue; if (u_has_coeffs) { - uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, + uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); if (transforms[i] != CHROMA_TS) { diff --git a/src/uvg266.h b/src/uvg266.h index e3d8c0f9..d2726655 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -543,9 +543,18 @@ typedef struct uvg_config uint8_t dual_tree; + uint8_t min_qt_size[3]; + uint8_t max_bt_size[3]; + uint8_t max_tt_size[3]; + + uint8_t max_intra_slice_btt_depth; + uint8_t max_intra_slice_btt_depth_chroma; + uint8_t max_inter_slice_btt_depth; + uint8_t intra_rough_search_levels; uint8_t ibc; /* \brief Intra Block Copy parameter */ + } uvg_config; /** diff --git a/tools/generate_tables.c b/tools/generate_tables.c index d50c889f..6bd2497e 100644 --- a/tools/generate_tables.c +++ b/tools/generate_tables.c @@ -51,7 +51,7 @@ static void init_sig_last_scan(uint32_t *buff_d, uint32_t *buff_h, uint32_t *buff_v, int32_t width, int32_t height) { - uint32_t num_scan_pos = width * width; + uint32_t num_scan_pos = width * height; uint32_t next_scan_pos = 0; int32_t xx, yy, x, y; uint32_t scan_line; From f19084569d21f9109b5752b896fb90e5d14bd609 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 16 Nov 2022 12:27:28 +0200 Subject: [PATCH 115/254] WIP --- src/cabac.h | 2 + src/cfg.c | 16 +++++ src/context.c | 20 ++++++ src/cu.h | 1 + src/encode_coding_tree.c | 99 ++++++++++++++------------ src/encode_coding_tree.h | 10 +-- src/encoder_state-bitstream.c | 24 +++---- src/encoderstate.c | 2 +- src/rdo.c | 27 +++---- src/scalinglist.c | 16 +++-- src/search.c | 18 +++-- src/search_inter.c | 20 ++++-- src/strategies/avx2/quant-avx2.c | 12 ++-- src/strategies/generic/quant-generic.c | 15 ++-- src/transform.h | 4 +- src/uvg266.h | 6 +- 16 files changed, 185 insertions(+), 107 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index be249ba2..f38030a9 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -77,6 +77,8 @@ typedef struct cabac_ctx_t 
mts_idx_model[4]; cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models + cabac_ctx_t mtt_vertical_model[5]; + cabac_ctx_t mtt_binary_model[4]; cabac_ctx_t intra_luma_mpm_flag_model; //!< \brief intra mode context models cabac_ctx_t intra_subpart_model[2]; //!< \brief intra sub part context models cabac_ctx_t chroma_pred_model; diff --git a/src/cfg.c b/src/cfg.c index f2073da5..39643e9f 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -222,6 +222,22 @@ int uvg_config_init(uvg_config *cfg) cfg->cabac_debug_file_name = NULL; cfg->dual_tree = 0; + + cfg->min_qt_size[0] = 4; + cfg->min_qt_size[1] = 4; + cfg->min_qt_size[2] = 4; + + cfg->max_btt_depth[0] = 1; + cfg->max_btt_depth[1] = 0; + cfg->max_btt_depth[2] = 0; + + cfg->max_tt_size[0] = 64; + cfg->max_bt_size[0] = 64; + cfg->max_tt_size[1] = 64; + cfg->max_bt_size[1] = 64; + cfg->max_tt_size[2] = 64; + cfg->max_bt_size[2] = 64; + cfg->intra_rough_search_levels = 2; cfg->ibc = 0; diff --git a/src/context.c b/src/context.c index 708b9da4..30861849 100644 --- a/src/context.c +++ b/src/context.c @@ -50,6 +50,21 @@ static const uint8_t INIT_QT_SPLIT_FLAG[4][6] = { { 0, 8, 8, 12, 12, 8, }, }; + +static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = { + { 43, 42, 37, 42, 44, }, + { 43, 35, 37, 34, 52, }, + { 43, 42, 29, 27, 44, }, + { 9, 8, 9, 8, 5, }, +}; + +static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = { + { 28, 29, 28, 29, }, + { 43, 37, 21, 22, }, + { 36, 45, 36, 45, }, + { 12, 13, 12, 13, }, + }; + static const uint8_t INIT_SKIP_FLAG[4][3] = { { 57, 60, 46, }, { 57, 59, 45, }, @@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice) uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]); uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]); uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, 
INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]); + uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]); + } + + for (i = 0; i < 5; i++) { + uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]); } for (i = 0; i < 6; i++) { diff --git a/src/cu.h b/src/cu.h index 7f1bd0e3..cc2f6925 100644 --- a/src/cu.h +++ b/src/cu.h @@ -105,6 +105,7 @@ enum split_type { typedef struct { uint32_t split_tree; uint8_t current_depth; + uint8_t mtt_depth; } split_tree_t; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 4468390c..ac8d206e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1199,14 +1199,13 @@ void uvg_encode_intra_luma_coding_unit( } -bool uvg_write_split_flag( - const encoder_state_t * const state, +uint8_t uvg_write_split_flag( + const encoder_state_t* const state, cabac_data_t* cabac, - const cu_info_t * left_cu, - const cu_info_t * above_cu, + const cu_info_t* left_cu, + const cu_info_t* above_cu, const cu_loc_t* const cu_loc, - const uint32_t split_tree, - int depth, + split_tree_t split_tree, enum uvg_tree_type tree_type, double* bits_out) { @@ -1217,15 +1216,15 @@ bool uvg_write_split_flag( // Implisit split flag when on border // Exception made in VVC with flag not being implicit if the BT can be used for // horizontal or vertical split, then this flag tells if QT or BT is used + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - if (depth > MAX_DEPTH) allow_qt = false; - // ToDo: update this when btt is actually used - bool allow_btt = true;// when mt_depth < MAX_BT_DEPTH const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int cu_height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; + if (cu_width == state->encoder_control->cfg.min_qt_size[slice_type] || split_tree.mtt_depth > 0) allow_qt = false; + bool allow_btt = state->encoder_control->cfg.max_btt_depth[slice_type] > split_tree.mtt_depth && cu_width <= 64; uint8_t implicit_split_mode = UVG_NO_SPLIT; //bool implicit_split = border; @@ -1255,10 +1254,16 @@ bool uvg_write_split_flag( if (!allow_btt) { bh_split = bv_split = th_split = tv_split = false; } + else { + bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type]; + tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type]; + bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type]; + th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type]; + } bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; - int split_flag = (split_tree >> (depth * 3)) & 7; + int split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag; @@ -1286,33 +1291,41 @@ bool uvg_write_split_flag( cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != 0, bits, "split_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != NO_SPLIT, bits, "split_cu_flag"); } - bool qt_split = split_flag == QT_SPLIT; - if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { - split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 
0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag"); - } - - // Only signal split when it is not implicit, currently only Qt split supported - if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { - - split_model = 0; - - // TODO: These are incorrect - if (left_cu && (1 << left_cu->log2_height) > cu_height) { - split_model++; + if (implicit_split_mode == UVG_NO_SPLIT && allow_qt && (bh_split || bv_split || th_split || tv_split) && split_flag != NO_SPLIT) { + bool qt_split = split_flag == QT_SPLIT; + if((bv_split || bh_split || tv_split || th_split) && allow_qt) { + split_model = (left_cu && GET_SPLITDATA(left_cu, split_tree.current_depth)) + (above_cu && GET_SPLITDATA(above_cu, split_tree.current_depth)) + (split_tree.current_depth < 2 ? 0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag"); } - - if (above_cu && (1 << above_cu->log2_width) > cu_width) { - split_model++; + if (!qt_split) { + const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT; + if((bh_split || th_split) && (bv_split || tv_split)) { + split_model = 0; + if(bv_split + tv_split > bh_split + th_split) { + split_model = 4; + } else if(bv_split + tv_split < bh_split + th_split) { + split_model = 3; + } else { + const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1); + const int d_l = cu_height / (left_cu ? (1 << left_cu->log2_height) : 1); + if(d_a != d_l && above_cu && left_cu) { + split_model = d_a < d_l ? 
1 : 2; + } + } + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag"); + } + if ((bv_split && tv_split && is_vertical) || (bh_split && th_split && !is_vertical)) { + split_model = 2 * is_vertical + split_tree.mtt_depth <= 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), + split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag"); + } } - - split_model += (depth > 2 ? 0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "split_cu_mode"); } + if (bits_out) *bits_out += bits; return split_flag; } @@ -1322,7 +1335,7 @@ void uvg_encode_coding_tree( lcu_coeff_t *coeff, enum uvg_tree_type tree_type, const cu_loc_t* const cu_loc, - const split_tree_t split_tree) + split_tree_t split_tree) { cabac_data_t * const cabac = &state->cabac; const encoder_control_t * const ctrl = state->encoder_control; @@ -1332,8 +1345,7 @@ void uvg_encode_coding_tree( const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; - const int half_cu = cu_width >> 1; - + const int x = cu_loc->x; const int y = cu_loc->y; @@ -1357,9 +1369,9 @@ void uvg_encode_coding_tree( int32_t frame_height = tree_type != UVG_CHROMA_T ? 
ctrl->in.height : ctrl->in.height / 2; // Check for slice border bool border_x = frame_width < abs_x + cu_width; - bool border_y = frame_height < abs_y + cu_width; - bool border_split_x = frame_width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu; - bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; + bool border_y = frame_height < abs_y + cu_height; + bool border_split_x = frame_width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + cu_width / 2; + bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + cu_height / 2; bool border = border_x || border_y; /*!< are we in any border CU */ if (depth <= state->frame->max_qp_delta_depth) { @@ -1368,21 +1380,20 @@ void uvg_encode_coding_tree( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (cu_width + cu_height > 8) { - + split_tree.split_tree = cur_cu->split_tree; const int split_flag = uvg_write_split_flag( state, cabac, left_cu, above_cu, cu_loc, - cur_cu->split_tree, - depth, + split_tree, tree_type, NULL); if (split_flag || border) { const int half_luma = cu_loc->width / 2; - split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1 }; + const split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT)}; cu_loc_t new_cu_loc[4]; const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc); @@ -1650,7 +1661,8 @@ double uvg_mock_encode_coding_unit( const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type) { + enum uvg_tree_type tree_type, + const split_tree_t split_tree) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; @@ -1692,8 +1704,7 @@ double uvg_mock_encode_coding_unit( left_cu, above_cu, cu_loc, - cur_cu->split_tree, - depth, + split_tree, tree_type, &bits); } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 86605e4d..357e059a 100644 --- 
a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -54,7 +54,7 @@ void uvg_encode_coding_tree( lcu_coeff_t *coeff, enum uvg_tree_type tree_type, const cu_loc_t* const cu_loc, - const split_tree_t split_tree); + split_tree_t split_tree); void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, @@ -77,7 +77,8 @@ double uvg_mock_encode_coding_unit( const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const split_tree_t split_tree); int uvg_encode_inter_prediction_unit( encoder_state_t* const state, @@ -96,14 +97,13 @@ void uvg_encode_intra_luma_coding_unit( double* bits_out); -bool uvg_write_split_flag( +uint8_t uvg_write_split_flag( const encoder_state_t* const state, cabac_data_t* cabac, const cu_info_t* left_cu, const cu_info_t* above_cu, const cu_loc_t* const cu_loc, - const uint32_t split_tree, - int depth, + split_tree_t, enum uvg_tree_type tree_type, double* bits_out); diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 1649d944..8e9f7c52 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -529,10 +529,10 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, // if(!no_partition_constraints_override_constraint_flag) WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag"); WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); - WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth, "sps_max_mtt_hierarchy_depth_intra_slice_luma"); - if (encoder->cfg.max_intra_slice_btt_depth) { - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma"); - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]], 
"sps_log2_diff_max_tt_min_qt_intra_slice_luma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma"); + if (encoder->cfg.max_btt_depth[0]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma"); } if (encoder->chroma_format != UVG_CSP_400) @@ -541,17 +541,17 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, } if (encoder->cfg.dual_tree) { WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); - WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth_chroma, "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); - if (encoder->cfg.max_intra_slice_btt_depth_chroma) { - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); + if (encoder->cfg.max_btt_depth[2]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); } } WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice"); - 
WRITE_UE(stream, encoder->cfg.max_inter_slice_btt_depth, "sps_max_mtt_hierarchy_depth_inter_slice"); - if (encoder->cfg.max_inter_slice_btt_depth != 0) { - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group"); - WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice"); + if (encoder->cfg.max_btt_depth[1] != 0) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group"); } if (LCU_WIDTH > 32) diff --git a/src/encoderstate.c b/src/encoderstate.c index eb529b2b..e8af6add 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -883,7 +883,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) //Encode coding tree cu_loc_t start; uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0 }; + split_tree_t split_tree = { 0, 0, 0 }; uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, split_tree); diff --git a/src/rdo.c b/src/rdo.c index 26f31634..5fef3b3c 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -705,19 +705,20 @@ static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t * tables generated during RDOQ to select the best coefficient to change. 
*/ void uvg_rdoq_sign_hiding( - const encoder_state_t *const state, - const int32_t qp_scaled, - const uint32_t *const scan2raster, - const struct sh_rates_t *const sh_rates, - const int32_t last_pos, - const coeff_t *const coeffs, - coeff_t *const quant_coeffs, - const int8_t color) + const encoder_state_t *const state, + const int32_t qp_scaled, + const uint32_t *const scan2raster, + const struct sh_rates_t *const sh_rates, + const int32_t last_pos, + const coeff_t *const coeffs, + coeff_t *const quant_coeffs, + const int8_t color, + const bool need_sqrt_adjust) { const encoder_control_t * const ctrl = state->encoder_control; const double lambda = color ? state->c_lambda : state->lambda; - int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6]; + int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6]; // This somehow scales quant_delta into fractional bits. Instead of the bits // being multiplied by lambda, the residual is divided by it, or something // like that. @@ -1203,7 +1204,7 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const bool needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation. const int q_bits = QUANT_SHIFT + qp_scaled / 6 + (needs_sqrt2_scale ? 
-1 : 0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits - const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6]; + const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6]; const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff; @@ -1416,8 +1417,10 @@ void uvg_rdoq( cabac_data_t * const cabac = &state->cabac; const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); + needs_block_size_trafo_scale |= 1; // Non log2 block size - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1) + needs_block_size_trafo_scale; // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; @@ -1789,7 +1792,7 @@ void uvg_rdoq( } if (encoder->cfg.signhide_enable && abs_sum >= 2) { - uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color); + uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale); } } diff --git a/src/scalinglist.c b/src/scalinglist.c index 5c32ac4c..01edfa27 100644 --- a/src/scalinglist.c +++ b/src/scalinglist.c @@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] = 24, 25, 28, 33, 41, 54, 71, 91 }; -const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564}; -const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72}; +const int16_t uvg_g_quant_scales[2][6] = { + {26214, 23302, 20560, 18396, 16384, 14564}, + { 18396,16384,14564,13107,11651,10280 } +}; +const int16_t 
uvg_g_inv_quant_scales[2][6] = { + {40, 45, 51, 57, 64, 72}, + { 57,64,72,80,90,102 } +}; /** @@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp]; int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp]; - // Encoder list - uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio, + // Encoder list TODO: the sqrt adjusted lists + uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); // Decoder list - scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio, + scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); diff --git a/src/search.c b/src/search.c index 56f8f566..f61ce721 100644 --- a/src/search.c +++ b/src/search.c @@ -1294,7 +1294,8 @@ static double search_cu( tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, lcu, cur_cu, - tree_type); + tree_type, + split_tree); cost = bits * state->lambda; @@ -1335,7 +1336,11 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { const int split_type = depth == 0 ? QT_SPLIT : BT_HOR_SPLIT; - const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1 }; + const split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + }; double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf); @@ -1374,8 +1379,7 @@ static double search_cu( left_cu, above_cu, tree_type != UVG_CHROMA_T ? 
cu_loc : &chroma_loc, - new_split.split_tree, - depth, + split_tree, tree_type, &split_bits); } @@ -1394,7 +1398,7 @@ static double search_cu( const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc); for (int split = 0; split < splits; ++split) { split_cost += search_cu(state, &new_cu_loc[split], &split_lcu, tree_type, new_split); - if (split_cost < cost) { + if (split_cost > cost) { break; } } @@ -1426,7 +1430,7 @@ static double search_cu( double bits = 0; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, cur_cu->split_tree, depth, tree_type, &bits); + y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -1715,7 +1719,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con cu_loc_t start; uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0 }; + split_tree_t split_tree = { 0, 0, 0 }; // Start search from depth 0. 
double cost = search_cu( state, diff --git a/src/search_inter.c b/src/search_inter.c index 37adaf27..4703152a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2124,9 +2124,7 @@ void uvg_cu_cost_inter_rd2( double *inter_cost, double* inter_bitcost, const cu_loc_t* const cu_loc){ - - const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - + const int x_px = SUB_SCU(cu_loc->x); const int y_px = SUB_SCU(cu_loc->y); const int width = cu_loc->width; @@ -2160,12 +2158,24 @@ void uvg_cu_cost_inter_rd2( double no_cbf_bits; double bits = 0; const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL); + + int8_t depth = 0; + int8_t mtt_depth = 0; + uint32_t splits = cur_cu->split_tree; + while (splits & 7) { + if ((splits & 7) != QT_SPLIT) { + mtt_depth++; + } + depth++; + splits >>= 3; + } + const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth }; if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 2d45166c..8c967bdb 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -386,11 +386,13 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) 
* 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 1; // Non log2 block size const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift); + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; @@ -792,13 +794,15 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 1; // Non log2 block size// Represents scaling through forward transform int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - shift = 20 - QUANT_SHIFT - (transform_skip ? 
0 : transform_shift); + shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); if (encoder->scaling_list.enable) { @@ -822,7 +826,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); __m256i v_scale = _mm256_set1_epi32(scale); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index eed95e59..13e08f3a 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -68,12 +68,13 @@ void uvg_quant_generic( int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - - + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 1; // Non log2 block size + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; - const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift); + const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift ); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 
171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; @@ -592,11 +593,13 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 1; // Non log2 block size int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift); + shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); if (encoder->scaling_list.enable) { @@ -620,7 +623,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); for (n = 0; n < width * height; n++) { diff --git a/src/transform.h b/src/transform.h index ebe31109..e96a2893 100644 --- a/src/transform.h +++ b/src/transform.h @@ -44,8 +44,8 @@ #include "global.h" // IWYU pragma: keep extern const uint8_t uvg_g_chroma_scale[58]; -extern const int16_t uvg_g_inv_quant_scales[6]; -extern const int16_t uvg_g_quant_scales[6]; +extern const int16_t uvg_g_inv_quant_scales[2][6]; +extern const int16_t uvg_g_quant_scales[2][6]; #define COEFF_ORDER_LINEAR 0 #define COEFF_ORDER_CU 1 diff --git a/src/uvg266.h b/src/uvg266.h index d2726655..7d772780 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -543,13 +543,11 @@ typedef struct uvg_config uint8_t dual_tree; - uint8_t min_qt_size[3]; + uint8_t min_qt_size[3]; /* 
intra, inter, dual tree chroma*/ uint8_t max_bt_size[3]; uint8_t max_tt_size[3]; - uint8_t max_intra_slice_btt_depth; - uint8_t max_intra_slice_btt_depth_chroma; - uint8_t max_inter_slice_btt_depth; + uint8_t max_btt_depth[3]; uint8_t intra_rough_search_levels; From 70cbaae619faeabeb58d41ab514cf89804b689db Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 17 Nov 2022 07:38:26 +0200 Subject: [PATCH 116/254] [mtt] square root adjustment for quantization --- src/rdo.c | 4 +++- src/strategies/avx2/quant-avx2.c | 4 +++- src/strategies/generic/quant-generic.c | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 5fef3b3c..791b5696 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1431,6 +1431,8 @@ void uvg_rdoq( int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; const double lambda = color ? state->c_lambda : state->lambda; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; const double *err_scale = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; @@ -1509,7 +1511,7 @@ void uvg_rdoq( if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; + int32_t q = use_scaling_list ? 
quant_coeff[blkpos] : default_quant_coeff; int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 8c967bdb..84cdd436 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -396,6 +396,8 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; int32_t last_cg = -1; @@ -404,7 +406,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr // Loading once is enough if scaling lists are not off __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256(); if (!(state->encoder_control->scaling_list.enable)) { - low_b = _mm256_set1_epi32(quant_coeff[0]); + low_b = _mm256_set1_epi32(default_quant_coeff); high_b = low_b; } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 13e08f3a..daa302a3 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -78,8 +78,12 @@ void uvg_quant_generic( const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; + if(lfnst_idx == 0){ for (int32_t n = 0; n < width * height; n++) { int32_t level = coef[n]; @@ -88,7 +92,7 @@ void uvg_quant_generic( sign = (level < 0 ? 
-1 : 1); - int32_t curr_quant_coeff = quant_coeff[n]; + int32_t curr_quant_coeff = use_scaling_list ? quant_coeff[n] : default_quant_coeff; level = (int32_t)((abs_level * curr_quant_coeff + add) >> q_bits); ac_sum += level; From 5ba8d45981449bebfefe526a844dea80d1a65625 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 17 Nov 2022 14:20:09 +0200 Subject: [PATCH 117/254] WIP --- src/intra.c | 12 +++--- src/search.c | 8 ++-- src/strategies/generic/intra-generic.c | 52 ++++---------------------- 3 files changed, 17 insertions(+), 55 deletions(-) diff --git a/src/intra.c b/src/intra.c index 99150ef2..d6f9ad3f 100644 --- a/src/intra.c +++ b/src/intra.c @@ -324,8 +324,8 @@ static void get_cclm_parameters( //int left_below_units = total_left_units - tu_height_in_units; //int avai_above_right_units = 0; // TODO these are non zero only with non-square CUs //int avai_left_below_units = 0; - int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size); - int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size); + int avai_above_units = y0 ? tu_width_in_units : 0; + int avai_left_units = x0 ? tu_height_in_units : 0; bool above_available = avai_above_units != 0; bool left_available = avai_left_units != 0; @@ -559,7 +559,7 @@ static void predict_cclm( } if(x0) { - if (x_scu == 0) available_left_below = MIN(MIN(width / 2, (64 - y_scu - height * 2) / 2), (state->tile->frame->height - y0 - height * 2) / 2); + if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 2), (state->tile->frame->height - y0 - height * 2) / 2); for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; y_extension >>= tree_type == UVG_CHROMA_T; @@ -937,7 +937,7 @@ static void intra_predict_regular( uint8_t isp = color == COLOR_Y ? 
isp_mode : 0; const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || width != height /*ISP_TODO: replace this fake ISP check*/) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. @@ -1400,7 +1400,7 @@ void uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_left = MIN(px_available_left, cu_height * 2); + px_available_left = MIN(px_available_left, height * 2); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1438,7 +1438,7 @@ void uvg_intra_build_reference_inner( // Extend for MRL if (multi_ref_index) { - for (; i < width * 2 + multi_ref_index; ++i) { + for (; i < height * 2 + multi_ref_index; ++i) { out_left_ref[i + 1] = nearest_pixel; } } diff --git a/src/search.c b/src/search.c index f61ce721..25a2ea1c 100644 --- a/src/search.c +++ b/src/search.c @@ -1085,7 +1085,7 @@ static double search_cu( false); downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } double intra_cost = intra_search.cost; @@ -1479,7 +1479,7 @@ static double search_cu( // search. 
memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); if (state->frame->slicetype != UVG_SLICE_I) { @@ -1498,7 +1498,7 @@ static double search_cu( } else { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { @@ -1506,7 +1506,7 @@ static double search_cu( // when searching SMP and AMP blocks. if(tree_type != UVG_CHROMA_T) { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index a0bd430d..f07fa020 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -61,8 +61,8 @@ static void uvg_angular_pred_generic( const uint8_t multi_ref_idx, const uint8_t isp_mode) { - const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; @@ -180,13 +180,6 @@ static void uvg_angular_pred_generic( } } - // Flip dimensions for horizontal modes - int tmp_width = vertical_mode ? width : height; - int tmp_height = vertical_mode ? height : width; - - uvg_pixel tmp_dst[LCU_WIDTH * LCU_WIDTH]; - uvg_pixel* dst_buf = vertical_mode ? 
dst : tmp_dst; - // compensate for line offset in reference line buffers ref_main += multi_ref_index; @@ -207,7 +200,7 @@ static void uvg_angular_pred_generic( uvg_pixel p[4]; bool use_cubic = true; // Default to cubic filter static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { if ((abs(sample_disp) & 0x1F) != 0) @@ -270,38 +263,6 @@ static void uvg_angular_pred_generic( dst_buf[y * tmp_width + x] = dst_buf[y * tmp_width + x] + ((wL * (left - dst_buf[y * tmp_width + x]) + 32) >> 6); } } - - /* - if (pred_mode == 2 || pred_mode == 66) { - int wT = 16 >> MIN(31, ((y << 1) >> scale)); - for (int x = 0; x < width; x++) { - int wL = 16 >> MIN(31, ((x << 1) >> scale)); - if (wT + wL == 0) break; - int c = x + y + 1; - if (c >= 2 * width) { wL = 0; } - if (c >= 2 * width) { wT = 0; } - const uvg_pixel left = (wL != 0) ? ref_side[c] : 0; - const uvg_pixel top = (wT != 0) ? 
ref_main[c] : 0; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6); - } - } else if (sample_disp == 0 || sample_disp >= 12) { - int inv_angle_sum_0 = 2; - for (int x = 0; x < width; x++) { - inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)]; - int delta_pos_0 = inv_angle_sum_0 >> 2; - int delta_frac_0 = delta_pos_0 & 63; - int delta_int_0 = delta_pos_0 >> 6; - int delta_y = y + delta_int_0 + 1; - // TODO: convert to JVET_K0500_WAIP - if (delta_y > width + width - 1) break; - - int wL = 32 >> MIN(31, ((x << 1) >> scale)); - if (wL == 0) break; - const uvg_pixel *p = ref_side + delta_y - 1; - uvg_pixel left = p[delta_frac_0 >> 5]; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6); - } - }*/ } } else { @@ -312,6 +273,7 @@ static void uvg_angular_pred_generic( bool do_pdpc = (((tmp_width >= 4 && tmp_height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { + if (!vertical_mode) {SWAP(width, height, int)} int scale = (log2_width + log2_height - 2) >> 2; const uvg_pixel top_left = ref_main[0]; for (int_fast32_t y = 0; y < tmp_height; ++y) { @@ -332,9 +294,9 @@ static void uvg_angular_pred_generic( // Flip the block if this is was a horizontal mode. 
if (!vertical_mode) { - for (int_fast32_t y = 0; y < tmp_height; ++y) { - for (int_fast32_t x = 0; x < tmp_width; ++x) { - dst[x * width + y] = tmp_dst[y * tmp_width + x]; + for (int_fast32_t y = 0; y < height - 1; ++y) { + for (int_fast32_t x = y + 1; x < width; ++x) { + SWAP(dst[y * height + x], dst[x * width + y], uvg_pixel); } } } From b893a9268c094383486e7b279e482fb7704a39e8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 18 Nov 2022 14:00:01 +0200 Subject: [PATCH 118/254] [mtt] WIP --- src/cu.c | 21 +++++++++++++ src/cu.h | 3 ++ src/encode_coding_tree.c | 18 +++++++++-- src/intra.c | 23 +++++++++----- src/intra.h | 1 + src/search.c | 2 +- src/search_intra.c | 22 ++++---------- src/strategies/generic/intra-generic.c | 42 +++++++++++++++++--------- 8 files changed, 90 insertions(+), 42 deletions(-) diff --git a/src/cu.c b/src/cu.c index 0256bd3d..8998dafd 100644 --- a/src/cu.c +++ b/src/cu.c @@ -357,4 +357,25 @@ int uvg_get_split_locs( return 3; } return 0; +} + +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left) +{ + if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) { + return 0; + } + if (left && cu_loc->local_x == 0) return (LCU_CU_WIDTH - cu_loc->local_y) / 4; + if (!left && cu_loc->local_y == 0) return (LCU_CU_WIDTH - cu_loc->local_x) / 4; + + int amount = 0; + if(left) { + while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount * TR_MIN_WIDTH)->type != CU_NOTSET) { + amount++; + } + return amount; + } + while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount * TR_MIN_WIDTH, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { + amount++; + } + return amount; } \ No newline at end of file diff --git a/src/cu.h b/src/cu.h index cc2f6925..6440f6f2 100644 --- a/src/cu.h +++ b/src/cu.h @@ -191,6 +191,7 @@ int uvg_get_split_locs( enum split_type split, cu_loc_t out[4]); + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? 
(cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) @@ -370,6 +371,8 @@ typedef struct { void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type); +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); + /** * \brief Return pointer to the top right reference CU. */ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ac8d206e..bacaf38b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1263,7 +1263,7 @@ uint8_t uvg_write_split_flag( bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; - int split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; + enum split_type split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag; @@ -1298,7 +1298,19 @@ uint8_t uvg_write_split_flag( if (implicit_split_mode == UVG_NO_SPLIT && allow_qt && (bh_split || bv_split || th_split || tv_split) && split_flag != NO_SPLIT) { bool qt_split = split_flag == QT_SPLIT; if((bv_split || bh_split || tv_split || th_split) && allow_qt) { - split_model = (left_cu && GET_SPLITDATA(left_cu, split_tree.current_depth)) + (above_cu && GET_SPLITDATA(above_cu, split_tree.current_depth)) + (split_tree.current_depth < 2 ? 0 : 3); + unsigned left_qt_depth = 0; + unsigned top_qt_depth = 0; + if(left_cu) { + while (((left_cu->split_tree >> left_qt_depth) & 7u) == QT_SPLIT) { + left_qt_depth++; + } + } + if(above_cu) { + while (((above_cu->split_tree >> top_qt_depth) & 7u) == QT_SPLIT) { + top_qt_depth++; + } + } + split_model = (left_cu && (left_qt_depth > split_tree.current_depth)) + (above_cu && (top_qt_depth > split_tree.current_depth)) + (split_tree.current_depth < 2 ? 
0 : 3); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag"); } if (!qt_split) { @@ -1319,7 +1331,7 @@ uint8_t uvg_write_split_flag( CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag"); } if ((bv_split && tv_split && is_vertical) || (bh_split && th_split && !is_vertical)) { - split_model = 2 * is_vertical + split_tree.mtt_depth <= 1; + split_model = (2 * is_vertical) + (split_tree.mtt_depth <= 1); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag"); } diff --git a/src/intra.c b/src/intra.c index d6f9ad3f..aacd238f 100644 --- a/src/intra.c +++ b/src/intra.c @@ -985,6 +985,7 @@ static void intra_predict_regular( void uvg_intra_build_reference_any( + const encoder_state_t* const state, const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, @@ -1019,6 +1020,7 @@ void uvg_intra_build_reference_any( const uvg_pixel dc_val = 1 << (UVG_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable const int is_chroma = color != COLOR_Y ? 1 : 0; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multi ref index from CU under prediction or reconstrcution. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1091,7 +1093,8 @@ void uvg_intra_build_reference_any( } } else { - px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? 
num_cus * 4 : num_cus *2; } // Limit the number of available pixels based on block size and dimensions @@ -1212,7 +1215,8 @@ void uvg_intra_build_reference_any( } } else { - px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions @@ -1245,6 +1249,7 @@ void uvg_intra_build_reference_any( } void uvg_intra_build_reference_inner( + const encoder_state_t* const state, const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, @@ -1280,6 +1285,7 @@ void uvg_intra_build_reference_inner( uvg_pixel * __restrict out_top_ref = &refs->ref.top[0]; const int is_chroma = color != COLOR_Y ? 1 : 0; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multiRefIdx from CU under prediction. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1395,7 +1401,8 @@ void uvg_intra_build_reference_inner( } else { - px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions @@ -1456,7 +1463,8 @@ void uvg_intra_build_reference_inner( } } else { - px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_top = is_dual_tree || !is_chroma ? 
num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions @@ -1488,6 +1496,7 @@ void uvg_intra_build_reference_inner( void uvg_intra_build_reference( + const encoder_state_t* const state, const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, @@ -1507,9 +1516,9 @@ void uvg_intra_build_reference( // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp_mode); + uvg_intra_build_reference_inner(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp_mode); } else { - uvg_intra_build_reference_any(pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp_mode); + uvg_intra_build_reference_any(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp_mode); } } @@ -1721,7 +1730,7 @@ static void intra_recon_tb_leaf( } } - uvg_intra_build_reference(pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); + uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; uvg_intra_predict(state, &refs, pu_loc, color, pred, search_data, lcu, tree_type); diff --git a/src/intra.h b/src/intra.h index deeb173d..7ef5357b 100644 --- a/src/intra.h +++ b/src/intra.h @@ -108,6 +108,7 @@ int8_t uvg_intra_get_dir_luma_predictor( * \param multi_ref_idx Multi reference line index for the prediction block. 
*/ void uvg_intra_build_reference( + const encoder_state_t* const state, const cu_loc_t* const pu_loc, const cu_loc_t* const cu_loc, const color_t color, diff --git a/src/search.c b/src/search.c index 25a2ea1c..090f5f16 100644 --- a/src/search.c +++ b/src/search.c @@ -1335,7 +1335,7 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { - const int split_type = depth == 0 ? QT_SPLIT : BT_HOR_SPLIT; + const int split_type = depth == 0 ? QT_SPLIT : BT_VER_SPLIT; const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, diff --git a/src/search_intra.c b/src/search_intra.c index d08b9d64..792bc1fc 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -294,14 +294,6 @@ static double search_intra_trdepth( const bool reconstruct_chroma = false;// (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400; cu_info_t* pred_cu = &search_data->pred_cu; - cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - - struct { - uvg_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH]; - } nosplit_pixels; - uint16_t nosplit_cbf = 0; double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; @@ -574,8 +566,6 @@ static double search_intra_trdepth( if (nosplit_cost >= cost_treshold) { return nosplit_cost; } - - nosplit_cbf = pred_cu->cbf; } @@ -648,10 +638,10 @@ static int search_intra_chroma_rough( const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; uvg_intra_references refs_u; - uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, &loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0, 0); uvg_intra_references refs_v; - uvg_intra_build_reference(&loc, 
&loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, &loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); vector2d_t lcu_cpx = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; @@ -1447,8 +1437,8 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - uvg_intra_build_reference(cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); - uvg_intra_build_reference(cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; cabac_data_t temp_cabac; @@ -1783,7 +1773,7 @@ void uvg_search_cu_intra( bool is_large = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; if (!is_large) { - uvg_intra_build_reference(cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); } // This is needed for bit cost calculation and requires too many parameters to be @@ -1848,7 +1838,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); for(int i = 1; i < 
INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index f07fa020..6e712bf5 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -113,6 +113,9 @@ static void uvg_angular_pred_generic( // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. + uvg_pixel temp_dst[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + // TODO: check the correct size for these arrays when MRL is used //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; @@ -138,6 +141,7 @@ static void uvg_angular_pred_generic( uvg_pixel *ref_main; // Pointer for the other reference. const uvg_pixel *ref_side; + uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst; const int cu_dim = MAX(width, height); const int top_ref_length = isp_mode ? width + cu_dim : width << 1; @@ -184,6 +188,7 @@ static void uvg_angular_pred_generic( // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; + if (!vertical_mode) { SWAP(width, height, int) } if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. 
@@ -221,7 +226,7 @@ static void uvg_angular_pred_generic( p[2] = ref_main[ref_main_index + 2]; p[3] = ref_main[ref_main_index + 3]; - dst_buf[y * tmp_width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); + work[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); } } @@ -231,14 +236,14 @@ static void uvg_angular_pred_generic( for (int_fast32_t x = 0; x < tmp_width; ++x) { uvg_pixel ref1 = ref_main[x + delta_int + 1]; uvg_pixel ref2 = ref_main[x + delta_int + 2]; - dst_buf[y * tmp_width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); + work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); } } } else { // Just copy the integer samples - for (int_fast32_t x = 0; x < tmp_width; x++) { - dst_buf[y * tmp_width + x] = ref_main[x + delta_int + 1]; + for (int_fast32_t x = 0; x < width; x++) { + work[y * width + x] = ref_main[x + delta_int + 1]; } } @@ -260,7 +265,7 @@ static void uvg_angular_pred_generic( int wL = 32 >> (2 * x >> scale); const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; - dst_buf[y * tmp_width + x] = dst_buf[y * tmp_width + x] + ((wL * (left - dst_buf[y * tmp_width + x]) + 32) >> 6); + work[y * width + x] = work[y * width + x] + ((wL * (left - work[y * width + x]) + 32) >> 6); } } } @@ -273,30 +278,37 @@ static void uvg_angular_pred_generic( bool do_pdpc = (((tmp_width >= 4 && tmp_height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { - if (!vertical_mode) {SWAP(width, height, int)} int scale = (log2_width + log2_height - 2) >> 2; const uvg_pixel top_left = ref_main[0]; - for (int_fast32_t y = 0; y < tmp_height; ++y) { - memcpy(&dst_buf[y * tmp_width], &ref_main[1], tmp_width * sizeof(uvg_pixel)); + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * 
sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; for (int_fast32_t x = 0; x < MIN(3 << scale, tmp_width); ++x) { const int wL = 32 >> (2 * x >> scale); - const uvg_pixel val = dst_buf[y * tmp_width + x]; - dst_buf[y * tmp_width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + const uvg_pixel val = work[y * width + x]; + work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); } } } else { - for (int_fast32_t y = 0; y < tmp_height; ++y) { - memcpy(&dst_buf[y * tmp_width], &ref_main[1], tmp_width * sizeof(uvg_pixel)); + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); } } } // Flip the block if this is was a horizontal mode. if (!vertical_mode) { - for (int_fast32_t y = 0; y < height - 1; ++y) { - for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * height + x], dst[x * width + y], uvg_pixel); + if(width == height) { + for (int_fast32_t y = 0; y < height - 1; ++y) { + for (int_fast32_t x = y + 1; x < width; ++x) { + SWAP(work[y * height + x], work[x * width + y], uvg_pixel); + } + } + } else { + for(int y = 0; y < width; ++y) { + for(int x = 0; x < height; ++x) { + dst[x + y * height] = work[y + x * width]; + } } } } From 5875dc1ef49e32ae5b1d96e4cb5239fa47e28b44 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 21 Nov 2022 09:15:43 +0200 Subject: [PATCH 119/254] [mtt] Fix counting the number of reference pixles and implement WAIP adjustment --- src/cu.c | 16 ++++++++-------- src/intra.c | 23 +++++++++++++++++++---- src/strategies/generic/intra-generic.c | 2 +- src/strategies/generic/quant-generic.c | 6 +++--- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/cu.c b/src/cu.c index 8998dafd..8c806733 100644 --- a/src/cu.c +++ b/src/cu.c @@ -364,18 +364,18 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) { return 0; } - if (left 
&& cu_loc->local_x == 0) return (LCU_CU_WIDTH - cu_loc->local_y) / 4; - if (!left && cu_loc->local_y == 0) return (LCU_CU_WIDTH - cu_loc->local_x) / 4; + if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4; + if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2; int amount = 0; if(left) { - while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount * TR_MIN_WIDTH)->type != CU_NOTSET) { - amount++; + while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET && (cu_loc->local_y + amount) < LCU_WIDTH) { + amount += TR_MIN_WIDTH; } - return amount; + return amount / TR_MIN_WIDTH; } - while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount * TR_MIN_WIDTH, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { - amount++; + while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET && cu_loc->local_x + amount < LCU_WIDTH) { + amount += TR_MIN_WIDTH; } - return amount; + return amount / TR_MIN_WIDTH; } \ No newline at end of file diff --git a/src/intra.c b/src/intra.c index aacd238f..8e750a00 100644 --- a/src/intra.c +++ b/src/intra.c @@ -936,6 +936,21 @@ static void intra_predict_regular( uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; uint8_t isp = color == COLOR_Y ? 
isp_mode : 0; + // Wide angle correction + int8_t pred_mode = mode; + if (!is_isp && width != height) { + if (mode > 1 && mode <= 66) { + const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; + const int deltaSize = abs(log2_width - log2_height); + if (width > height && mode < 2 + modeShift[deltaSize]) { + pred_mode += (66 - 1); + } + else if (height > width && mode > 66 - modeShift[deltaSize]) { + pred_mode -= (66 - 1); + } + } + } + const uvg_intra_ref *used_ref = &refs->ref; if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. @@ -949,11 +964,11 @@ static void intra_predict_regular( // to being either vertical or horizontal. static const int uvg_intra_hor_ver_dist_thres[8] = {24, 24, 24, 14, 2, 0, 0, 0 }; int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; - int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18)); + int dist_from_vert_or_hor = MIN(abs(pred_mode - 50), abs(pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (mode >= 34) ? mode - 50 : 18 - mode; + const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; const int_fast8_t sample_disp = (mode_disp < 0 ? 
-1 : 1) * modedisp2sampledisp[abs(mode_disp)]; if ((abs(sample_disp) & 0x1F) == 0) { used_ref = &refs->filtered_ref; @@ -970,7 +985,7 @@ static void intra_predict_regular( } else if (mode == 1) { intra_pred_dc(cu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(cu_loc, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index, isp); + uvg_angular_pred(cu_loc, pred_mode, color, used_ref->top, used_ref->left, dst, multi_ref_index, isp); } // pdpc @@ -1463,7 +1478,7 @@ void uvg_intra_build_reference_inner( } } else { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; } diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 6e712bf5..84373d21 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -68,7 +68,7 @@ static void uvg_angular_pred_generic( // Log2_dim 1 is possible with ISP blocks assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); - assert(intra_mode >= 2 && intra_mode <= 66); + // assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index daa302a3..7dc110d7 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -375,7 +375,7 @@ int uvg_quant_cbcr_residual_generic( //} const 
int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. (residual + pred_in -> rec_out) - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { if (temp == 2) { u_residual[x + y * width] = combined_residual[x + y * width]; @@ -404,7 +404,7 @@ int uvg_quant_cbcr_residual_generic( } } } - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride]; u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val); @@ -417,7 +417,7 @@ int uvg_quant_cbcr_residual_generic( // With no coeffs and rec_out == pred_int we skip copying the coefficients // because the reconstruction is just the prediction. - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride]; v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride]; From ab21c7e1d73bb10602b510682930b81b39ba7f73 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 21 Nov 2022 12:37:55 +0200 Subject: [PATCH 120/254] [mtt] Fix sqrt adjustment, cclm calculation on edges of CTU and waip for lfnst --- src/intra.c | 39 +++++++++------- src/intra.h | 6 +++ src/rdo.c | 2 +- src/search.c | 2 +- src/strategies/avx2/quant-avx2.c | 4 +- src/strategies/generic/quant-generic.c | 4 +- src/transform.c | 62 +++++++++++++++++--------- 7 files changed, 77 insertions(+), 42 deletions(-) diff --git a/src/intra.c b/src/intra.c index 8e750a00..2c387a64 100644 --- a/src/intra.c +++ b/src/intra.c @@ -532,7 +532,7 @@ static void predict_cclm( const int ctu_size = tree_type == UVG_CHROMA_T ? 
LCU_WIDTH_C : LCU_WIDTH; if (y0) { - if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 2), (state->tile->frame->width - x0 - width* 2) / 2); + if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 4), (state->tile->frame->width - x0 - width* 2) / 4); for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; x_extension >>= tree_type == UVG_CHROMA_T; @@ -559,7 +559,7 @@ static void predict_cclm( } if(x0) { - if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 2), (state->tile->frame->height - y0 - height * 2) / 2); + if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 4), (state->tile->frame->height - y0 - height * 2) / 4); for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; y_extension >>= tree_type == UVG_CHROMA_T; @@ -916,6 +916,24 @@ static void mip_predict( } +int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int log2_width, const int log2_height) +{ + int8_t pred_mode = mode; + if (!is_isp && log2_width != log2_height) { + if (mode > 1 && mode <= 66) { + const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; + const int deltaSize = abs(log2_width - log2_height); + if (log2_width > log2_height && mode < 2 + modeShift[deltaSize]) { + pred_mode += (66 - 1); + } + else if (log2_height > log2_width && mode > 66 - modeShift[deltaSize]) { + pred_mode -= (66 - 1); + } + } + } + return pred_mode; +} + static void intra_predict_regular( const encoder_state_t* const state, uvg_intra_references *refs, @@ -937,19 +955,10 @@ static void intra_predict_regular( uint8_t isp = color == COLOR_Y ? 
isp_mode : 0; // Wide angle correction - int8_t pred_mode = mode; - if (!is_isp && width != height) { - if (mode > 1 && mode <= 66) { - const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; - const int deltaSize = abs(log2_width - log2_height); - if (width > height && mode < 2 + modeShift[deltaSize]) { - pred_mode += (66 - 1); - } - else if (height > width && mode > 66 - modeShift[deltaSize]) { - pred_mode -= (66 - 1); - } - } - } + int8_t pred_mode = uvg_wide_angle_correction(mode, + is_isp, + log2_width, + log2_height); const uvg_intra_ref *used_ref = &refs->ref; if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { diff --git a/src/intra.h b/src/intra.h index 7ef5357b..7c4c8852 100644 --- a/src/intra.h +++ b/src/intra.h @@ -165,6 +165,12 @@ uint8_t uvg_get_mip_flag_context( const lcu_t* lcu, cu_array_t* const cu_a); +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const bool is_isp, + const int log2_width, + const int log2_height); + // ISP related defines #define NUM_ISP_MODES 3 #define ISP_MODE_NO_ISP 0 diff --git a/src/rdo.c b/src/rdo.c index 791b5696..6122269d 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1418,7 +1418,7 @@ void uvg_rdoq( const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); - needs_block_size_trafo_scale |= 1; // Non log2 block size + needs_block_size_trafo_scale |= 0; // Non log2 block size int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1) + needs_block_size_trafo_scale; // Represents scaling through forward transform uint16_t go_rice_param = 0; diff --git a/src/search.c b/src/search.c index 090f5f16..5166e47f 100644 --- a/src/search.c +++ b/src/search.c @@ -1335,7 +1335,7 @@ static 
double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { - const int split_type = depth == 0 ? QT_SPLIT : BT_VER_SPLIT; + const int split_type = depth == 0 ? QT_SPLIT : TT_HOR_SPLIT; const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 84cdd436..a7e0f2f6 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -387,7 +387,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); - needs_block_size_trafo_scale |= 1; // Non log2 block size + needs_block_size_trafo_scale |= 0; // Non log2 block size const int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; @@ -798,7 +798,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); - needs_block_size_trafo_scale |= 1; // Non log2 block size// Represents scaling through forward transform + needs_block_size_trafo_scale |= 0; // Non log2 block size// Represents scaling through forward transform int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 7dc110d7..04a668f3 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -69,7 +69,7 @@ void uvg_quant_generic( int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); - needs_block_size_trafo_scale |= 1; // Non log2 block size + needs_block_size_trafo_scale |= 0; // Non log2 block size const int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; @@ -598,7 +598,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); - needs_block_size_trafo_scale |= 1; // Non log2 block size + needs_block_size_trafo_scale |= 0; // Non log2 block size int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; diff --git a/src/transform.c b/src/transform.c index 0169a0ff..4734b672 100644 --- a/src/transform.c +++ b/src/transform.c @@ -34,6 +34,7 @@ #include "encode_coding_tree.h" #include "image.h" +#include "intra.h" #include "uvg266.h" #include "lfnst_tables.h" #include "rdo.h" @@ -184,8 +185,9 @@ void uvg_derive_lfnst_constraints( coeff_scan_order_t scan_idx = SCAN_DIAG; // ToDo: large block support in VVC? 
- const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; - const uint32_t* scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); signed scan_pos_last = -1; coeff_t temp[TR_MAX_WIDTH * TR_MAX_WIDTH]; @@ -801,7 +803,27 @@ void uvg_fwd_lfnst_NxN(coeff_t *src, coeff_t *dst, const int8_t mode, const int8 } } -static inline bool get_transpose_flag(const int8_t intra_mode) +static uint32_t get_lfnst_intra_mode(int mode) +{ + uint32_t intraMode; + + if (mode < 0) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1) + NUM_LUMA_MODE); + } + else if (mode >= NUM_LUMA_MODE) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1)); + } + else + { + intraMode = (uint32_t)mode; + } + + return intraMode; +} + +static bool get_transpose_flag(const int8_t intra_mode) { return ((intra_mode >= NUM_LUMA_MODE) && (intra_mode >= (NUM_LUMA_MODE + (NUM_EXT_LUMA_MODE >> 1)))) || ((intra_mode < NUM_LUMA_MODE) && (intra_mode > DIA_IDX)); @@ -837,22 +859,22 @@ void uvg_fwd_lfnst( enum uvg_tree_type tree_type) { const uint16_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? 
cur_cu->intra.mode : cur_cu->intra.mode_chroma; bool mts_skip = cur_cu->tr_idx == MTS_SKIP; bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); - bool is_wide_angle = false; // TODO: get wide angle mode when implemented const int scan_order = SCAN_DIAG; if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; - assert(log2_block_size != -1 && "LFNST: invalid block width."); + assert(log2_width != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { intra_mode = cur_cu->intra.mode; @@ -862,11 +884,11 @@ void uvg_fwd_lfnst( } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); - - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } + int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height); + + // Transform wide angle mode to intra mode + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; @@ -971,20 +993,19 @@ void uvg_inv_lfnst( // Such is not yet present in uvg266 so use 15 for now const int max_log2_dyn_range = 15; const uint32_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; bool mts_skip = cur_cu->tr_idx == MTS_SKIP; bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); - bool is_wide_angle = false; // TODO: get wide angle mode when implemented - const int scan_order = SCAN_DIAG; if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_log2[width]; const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { intra_mode = cur_cu->intra.mip_flag ? 0 : cur_cu->intra.mode; @@ -994,12 +1015,11 @@ void uvg_inv_lfnst( } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); + int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height); - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } - + + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose_flag = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; bool tu_4x4_flag = (width == 4 && height == 4); From 26ee443d2fd9c60de9cae6bb4ce3edc6a0588867 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 21 Nov 2022 13:45:36 +0200 Subject: [PATCH 121/254] [mtt] 64x32 and 32x64 --- src/cu.c | 4 ++-- src/encode_coding_tree.c | 30 +++++++++++++++++----------- src/intra.c | 43 ++++++++++++++++------------------------ src/search_intra.c | 31 ++++++++++++++--------------- src/transform.c | 32 ++++++++++++++++-------------- 5 files changed, 69 insertions(+), 71 deletions(-) diff --git a/src/cu.c b/src/cu.c index 8c806733..e84526bd 100644 --- a/src/cu.c +++ b/src/cu.c @@ -372,10 +372,10 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET && (cu_loc->local_y + amount) < LCU_WIDTH) { amount += TR_MIN_WIDTH; } - return amount / TR_MIN_WIDTH; + return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH); } while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET && cu_loc->local_x + amount < LCU_WIDTH) { amount += TR_MIN_WIDTH; } - return amount / TR_MIN_WIDTH; + return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); } \ No newline at end of file diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index bacaf38b..f93b6cf7 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -625,25 +625,31 @@ static void encode_transform_coeff( const int x_cu = 8 * (x / 8); const int y_cu = 8 * (y / 8); const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); // TODO: very suspect, chroma cbfs stored in upper left corner, everything else in bottom right for depth 4 - - int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH); + + const bool ver_split = cu_loc->height > TR_MAX_WIDTH; + const bool hor_split = cu_loc->width > TR_MAX_WIDTH; const int cb_flag_y = tree_type != 
UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, COLOR_Y) : 0; const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, COLOR_U)) : 0; const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, COLOR_V)) : 0; - if (split) { - int split_width = width >> 1; - int split_height = height >> 1; + if (hor_split || ver_split) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - for (int j = 0; j < 2; j++) { - for (int i = 0; i < 2; i++) { - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - - encode_transform_coeff(state, &loc, only_chroma, coeff, tree_type, true, luma_cbf_ctx, &loc); - } + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + for (int i = 0; i < split_count; ++i) { + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, tree_type, true, luma_cbf_ctx, &split_cu_loc[i]); } return; } diff --git a/src/intra.c b/src/intra.c index 2c387a64..398ebc39 100644 --- a/src/intra.c +++ b/src/intra.c @@ -565,7 +565,7 @@ static void predict_cclm( y_extension >>= tree_type == UVG_CHROMA_T; const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu >> (tree_type == UVG_CHROMA_T)) - 4, y_extension); if (y_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; - if(x_scu == 32 && y_scu == 0 && pu->log2_width == 6) break; + if(x_scu == 32 && y_scu == 0 && pu->log2_height == 6 && pu->log2_width == 6 ) break; } for(int i = 0; i < height + available_left_below * 2; i++) { sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; @@ -1783,6 +1783,7 @@ static void 
intra_recon_tb_leaf( } } + /** * \brief Reconstruct an intra CU * @@ -1833,33 +1834,23 @@ void uvg_intra_recon_cu( } if (width > TR_MAX_WIDTH || height > TR_MAX_WIDTH) { - cu_loc_t split_cu_loc; + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - const int half_width = width / 2; - const int half_height = height / 2; - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); - uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - uvg_intra_recon_cu(state, search_data, &split_cu_loc, NULL, lcu, tree_type, recon_luma, recon_chroma); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + for (int i = 0; i < split_count; ++i) { + uvg_intra_recon_cu(state, search_data, &split_cu_loc[i], NULL, lcu, tree_type, recon_luma, recon_chroma); + } - // Propagate coded block flags from child CUs to parent CU. 
- uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + half_width) >> (tree_type == UVG_CHROMA_T), lcu_px.y >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x >> (tree_type == UVG_CHROMA_T), (lcu_px.y + half_height) >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + half_width) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + half_height) >> (tree_type == UVG_CHROMA_T))->cbf, - }; - - //if (recon_luma && depth <= MAX_DEPTH) { - // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); - //} - //if (recon_chroma && depth <= MAX_DEPTH) { - // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); - // cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); - //} return; } if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { diff --git a/src/search_intra.c b/src/search_intra.c index 792bc1fc..07826cec 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -575,25 +575,24 @@ static double search_intra_trdepth( // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). 
else { - cu_loc_t split_cu_loc; - - const int half_width = width / 2; - const int half_height = height / 2; split_cost = 0; - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); - if (split_cost < nosplit_cost) { - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); + + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } - if (split_cost < nosplit_cost) { - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; } - if (split_cost < nosplit_cost) { - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - split_cost += search_intra_trdepth(state, &split_cu_loc, nosplit_cost, search_data, lcu, tree_type); + else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + for (int i = 0; i < split_count; ++i) { + split_cost += search_intra_trdepth(state, &split_cu_loc[i], nosplit_cost, search_data, lcu, tree_type); } } @@ -1821,7 +1820,7 @@ void uvg_search_cu_intra( } uint8_t num_mrl_modes = 0; - for(int line = 1; line < lines; ++line) { + for(int line = 1; line < lines && !is_large; ++line) { uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) { diff --git a/src/transform.c b/src/transform.c index 4734b672..968ae440 100644 --- a/src/transform.c +++ b/src/transform.c @@ -1400,25 +1400,27 @@ void uvg_quantize_lcu_residual( } if (cu_loc->width > 
TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - // Split transform and increase depth - const int offset = width / 2; - for (int j = 0; j < 2; ++j) { - for (int i = 0; i < 2; ++i) { - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1); - // jccr is currently not supported if transform is split - uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, NULL, lcu, early_skip, tree_type); + cu_loc_t split_cu_loc[4]; + uint16_t child_cbfs[3]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + for (int i = 0; i < split_count; ++i) { + uvg_quantize_lcu_residual(state, luma, chroma, 0, &split_cu_loc[i], NULL, lcu, early_skip, tree_type); + if(i != 0) { + child_cbfs[i - 1] = LCU_GET_CU_AT_PX(lcu, split_cu_loc[i].local_x, split_cu_loc[i].local_y)->cbf; } } - - // Propagate coded block flags from child CUs to parent CU. 
- uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, - }; cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) || cbf_is_set_any(child_cbfs[0]) From d257376ca01e392d94dad27dfa6b9b59d97f41e7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Nov 2022 09:04:42 +0200 Subject: [PATCH 122/254] [mtt] Single mtt split works for everything else, except 16x16 with TT --- src/cu.c | 12 +- src/cu.h | 3 +- src/encode_coding_tree.c | 101 +++++++------- src/encode_coding_tree.h | 4 +- src/encoder_state-bitstream.c | 2 +- src/encoderstate.c | 4 +- src/global.h | 4 +- src/intra.c | 19 ++- src/intra.h | 3 +- src/search.c | 181 +++++++++++++------------ src/search_intra.c | 20 +-- src/search_intra.h | 3 +- src/strategies/generic/dct-generic.c | 4 +- src/strategies/generic/intra-generic.c | 5 +- src/transform.c | 15 +- 15 files changed, 202 insertions(+), 178 deletions(-) diff --git a/src/cu.c b/src/cu.c index e84526bd..b0cb2a63 100644 --- a/src/cu.c +++ b/src/cu.c @@ -312,15 +312,16 @@ void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) loc->height = height; // TODO: when MTT is implemented, chroma dimensions can be minimum 2. // Chroma width is half of luma width, when not at maximum depth. 
- loc->chroma_width = MAX(width >> 1, 4); - loc->chroma_height = MAX(height >> 1, 4); + loc->chroma_width = width >> 1; + loc->chroma_height = height >> 1; } int uvg_get_split_locs( const cu_loc_t* const origin, enum split_type split, - cu_loc_t out[4]) + cu_loc_t out[4], + uint8_t* separate_chroma) { const int half_width = origin->width >> 1; const int half_height = origin->height >> 1; @@ -336,24 +337,29 @@ int uvg_get_split_locs( uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height); uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height); uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height); + if (half_height == 4 && separate_chroma) *separate_chroma = 1; return 4; case BT_HOR_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height); uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height); + if (half_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; return 2; case BT_VER_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); + if (half_width == 4 && separate_chroma) *separate_chroma = 1; return 2; case TT_HOR_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height); uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height); + if (quarter_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; return 3; case TT_VER_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, 
origin->height); + if (quarter_width == 4 && separate_chroma) *separate_chroma = 1; return 3; } return 0; diff --git a/src/cu.h b/src/cu.h index 6440f6f2..48a021c3 100644 --- a/src/cu.h +++ b/src/cu.h @@ -189,7 +189,8 @@ void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); int uvg_get_split_locs( const cu_loc_t* const origin, enum split_type split, - cu_loc_t out[4]); + cu_loc_t out[4], + uint8_t* separate_chroma); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index f93b6cf7..c908449d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -466,7 +466,6 @@ static void encode_chroma_tu( uvg_get_sub_coeff(coeff_v, coeff->v, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (cbf_is_set(cur_pu->cbf, COLOR_U)) { - // TODO: height for this check and the others below if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ @@ -505,8 +504,9 @@ static void encode_chroma_tu( static void encode_transform_unit( encoder_state_t * const state, const cu_loc_t *cu_loc, - bool only_chroma, + const cu_info_t* cur_pu, lcu_coeff_t* coeff, + bool only_chroma, enum uvg_tree_type tree_type, bool last_split, const cu_loc_t *original_loc) // Original cu dimensions, before CU split @@ -524,7 +524,9 @@ static void encode_transform_unit( int isp_x = x; int isp_y = y; uvg_get_isp_cu_arr_coords(&isp_x, &isp_y); - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); + if(cur_pu == NULL) { + cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); + } int8_t scan_idx = SCAN_DIAG; @@ -540,7 +542,10 @@ static void encode_transform_unit( // CoeffNxN // Residual Coding - if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && !(cur_pu->type == CU_INTRA && 
cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { + if(state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, height, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0); @@ -561,7 +566,7 @@ static void encode_transform_unit( } bool joint_chroma = cur_pu->joint_cb_cr != 0; - if (cur_pu->log2_height + cur_pu->log2_width < 6 && tree_type != UVG_CHROMA_T) { + if (cur_pu->log2_height + cur_pu->log2_width < 6 && tree_type != UVG_CHROMA_T && !only_chroma) { // For size 4x4 luma transform the corresponding chroma transforms are // also of size 4x4 covering 8x8 luma pixels. The residual is coded in // the last transform unit. @@ -597,6 +602,7 @@ static void encode_transform_coeff( const cu_loc_t * cu_loc, bool only_chroma, lcu_coeff_t* coeff, + const cu_info_t* cur_tu, enum uvg_tree_type tree_type, bool last_split, bool can_skip_last_cbf, @@ -604,10 +610,6 @@ static void encode_transform_coeff( const cu_loc_t * const original_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; - int x = cu_loc->x; - int y = cu_loc->y; - const int width = cu_loc->width; - const int height = cu_loc->height; bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; @@ -618,20 +620,16 @@ static void encode_transform_coeff( //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? 
frame->cu_array : frame->chroma_cu_array; - - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, x, y); - // Round coordinates down to a multiple of 8 to get the location of the - // containing CU. - const int x_cu = 8 * (x / 8); - const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); // TODO: very suspect, chroma cbfs stored in upper left corner, everything else in bottom right for depth 4 + if(cur_tu == NULL) { + cur_tu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); + } const bool ver_split = cu_loc->height > TR_MAX_WIDTH; const bool hor_split = cu_loc->width > TR_MAX_WIDTH; - const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, COLOR_Y) : 0; - const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, COLOR_U)) : 0; - const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, COLOR_V)) : 0; + const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_tu->cbf, COLOR_Y) : 0; + const int cb_flag_u = tree_type != UVG_LUMA_T ?(cur_tu->joint_cb_cr ? (cur_tu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_tu->cbf, COLOR_U)) : 0; + const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_tu->joint_cb_cr ? 
cur_tu->joint_cb_cr & 1 : cbf_is_set(cur_tu->cbf, COLOR_V)) : 0; if (hor_split || ver_split) { @@ -647,9 +645,9 @@ static void encode_transform_coeff( } cu_loc_t split_cu_loc[4]; - const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, tree_type, true, luma_cbf_ctx, &split_cu_loc[i]); + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, luma_cbf_ctx, &split_cu_loc[i]); } return; } @@ -658,7 +656,7 @@ static void encode_transform_coeff( // Not the last CU for area of 64 pixels cowered by more than one luma CU. // Not the last ISP Split if (state->encoder_control->chroma_format != UVG_CSP_400 - && (cur_pu->log2_height + cur_pu->log2_width >= 6 || only_chroma) + && (cur_tu->log2_height + cur_tu->log2_width >= 6 || only_chroma) && tree_type != UVG_LUMA_T && last_split) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); @@ -684,22 +682,22 @@ static void encode_transform_coeff( // - transform depth > 0 // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. 
- if ((cur_cu->type == CU_INTRA || !PU_IS_TU(cur_cu) || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { + if ((cur_tu->type == CU_INTRA || !PU_IS_TU(cur_tu) || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { if (can_skip_last_cbf && isp_split && last_split) { // Do not write luma cbf if first three isp splits have luma cbf 0 } else { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); - if (PU_IS_TU(cur_cu)) { + if (PU_IS_TU(cur_tu)) { *luma_cbf_ctx = 2 + cb_flag_y; } } } if (cb_flag_y | cb_flag_u | cb_flag_v) { - if (state->must_code_qp_delta && (only_chroma || cb_flag_y || cur_pu->log2_height + cur_pu->log2_width >= 6) ) { - const int qp_pred = uvg_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); - const int qp_delta = cur_cu->qp - qp_pred; + if (state->must_code_qp_delta && (only_chroma || cb_flag_y || cur_tu->log2_height + cur_tu->log2_width >= 6) ) { + const int qp_pred = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, state->last_qp); + const int qp_delta = cur_tu->qp - qp_pred; // Possible deltaQP range depends on bit depth as stated in HEVC specification. 
assert(qp_delta >= UVG_QP_DELTA_MIN && qp_delta <= UVG_QP_DELTA_MAX && "QP delta not in valid range."); @@ -722,18 +720,18 @@ static void encode_transform_coeff( } if(( ((cb_flag_u || cb_flag_v ) - && cur_cu->type == CU_INTRA) + && cur_tu->type == CU_INTRA) || (cb_flag_u && cb_flag_v)) - && (cur_pu->log2_height + cur_pu->log2_width >= 6 || only_chroma || tree_type == UVG_CHROMA_T) + && (cur_tu->log2_height + cur_tu->log2_width >= 6 || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr && last_split ) { - assert(cur_pu->joint_cb_cr < 4 && "JointCbCr is in search state."); + assert(cur_tu->joint_cb_cr < 4 && "JointCbCr is in search state."); cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; - CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); + CABAC_BIN(cabac, cur_tu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, cu_loc, only_chroma, coeff, tree_type, last_split, original_loc); + encode_transform_unit(state, cu_loc, only_chroma ? 
cur_tu : NULL, coeff, only_chroma, tree_type, last_split, original_loc); } } @@ -1261,10 +1259,10 @@ uint8_t uvg_write_split_flag( bh_split = bv_split = th_split = tv_split = false; } else { - bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type]; - tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type]; - bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type]; - th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type]; + bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type] && cu_width > state->encoder_control->cfg.min_qt_size[slice_type]; + tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type] && cu_width > 2 * state->encoder_control->cfg.min_qt_size[slice_type]; + bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type] && cu_height > state->encoder_control->cfg.min_qt_size[slice_type]; + th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type] && cu_height > 2 * state->encoder_control->cfg.min_qt_size[slice_type]; } bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; @@ -1307,12 +1305,12 @@ uint8_t uvg_write_split_flag( unsigned left_qt_depth = 0; unsigned top_qt_depth = 0; if(left_cu) { - while (((left_cu->split_tree >> left_qt_depth) & 7u) == QT_SPLIT) { + while (((left_cu->split_tree >> (left_qt_depth * 3)) & 7u) == QT_SPLIT) { left_qt_depth++; } } if(above_cu) { - while (((above_cu->split_tree >> top_qt_depth) & 7u) == QT_SPLIT) { + while (((above_cu->split_tree >> (top_qt_depth * 3)) & 7u) == QT_SPLIT) { top_qt_depth++; } } @@ -1353,7 +1351,9 @@ void uvg_encode_coding_tree( lcu_coeff_t *coeff, enum uvg_tree_type tree_type, const cu_loc_t* const cu_loc, - split_tree_t split_tree) + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma) { cabac_data_t * const cabac = &state->cabac; const encoder_control_t * const ctrl = state->encoder_control; @@ 
-1410,13 +1410,15 @@ void uvg_encode_coding_tree( NULL); if (split_flag || border) { - const int half_luma = cu_loc->width / 2; const split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT)}; cu_loc_t new_cu_loc[4]; - const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc); + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); for (int split = 0; split encoder_control->chroma_format != UVG_CSP_400 ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc); } encode_mts_idx(state, cabac, cur_cu, cu_loc); // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != UVG_CSP_400 && - ((depth == 4 && x % 8 && y % 8) || tree_type == UVG_CHROMA_T) && + (has_chroma || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); // LFNST constraints must be reset here. Otherwise the left over values will interfere when calculating new constraints - cu_info_t* tmp = uvg_cu_array_at((cu_array_t*)used_array, x, y); + cu_info_t* tmp = (cu_info_t*)cur_cu; tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, cu_loc, 1, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, chroma_loc, 1, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, chroma_loc); // Write LFNST only once for single tree structure - encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, cu_loc); + encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? 
UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 357e059a..5a9b4023 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -54,7 +54,9 @@ void uvg_encode_coding_tree( lcu_coeff_t *coeff, enum uvg_tree_type tree_type, const cu_loc_t* const cu_loc, - split_tree_t split_tree); + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma); void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 8e9f7c52..920331a5 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -1125,7 +1125,7 @@ static void uvg_encoder_state_write_bitstream_picture_header( WRITE_U(stream, 0, 1, "ph_mvd_l1_zero_flag"); } - if (encoder->cfg.jccr) { + if (encoder->cfg.jccr && encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER diff --git a/src/encoderstate.c b/src/encoderstate.c index e8af6add..6c7517d8 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -885,11 +885,11 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); split_tree_t split_tree = { 0, 0, 0 }; - uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, split_tree); + uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, LCU_WIDTH, LCU_WIDTH); - uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, split_tree); + uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &start, split_tree, true); } if (!state->cabac.only_count) { diff --git a/src/global.h 
b/src/global.h index 87ca92ee..a6a109c5 100644 --- a/src/global.h +++ b/src/global.h @@ -128,9 +128,9 @@ typedef int16_t coeff_t; typedef int32_t mv_t; -//#define VERBOSE 1 +#define VERBOSE 1 #define UVG_DEBUG_PRINT_CABAC 1 -//#define UVG_DEBUG 1 +#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/intra.c b/src/intra.c index 398ebc39..5db5abe5 100644 --- a/src/intra.c +++ b/src/intra.c @@ -916,7 +916,8 @@ static void mip_predict( } -int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int log2_width, const int log2_height) +int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int log2_width, const int log2_height, const + bool account_for_dc_planar) { int8_t pred_mode = mode; if (!is_isp && log2_width != log2_height) { @@ -927,7 +928,7 @@ int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int pred_mode += (66 - 1); } else if (log2_height > log2_width && mode > 66 - modeShift[deltaSize]) { - pred_mode -= (66 - 1); + pred_mode -= (66 - 1) + (account_for_dc_planar ? 
2 : 0); } } } @@ -958,7 +959,8 @@ static void intra_predict_regular( int8_t pred_mode = uvg_wide_angle_correction(mode, is_isp, log2_width, - log2_height); + log2_height, + false); const uvg_intra_ref *used_ref = &refs->ref; if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { @@ -1817,12 +1819,7 @@ void uvg_intra_recon_cu( cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - cu_loc_t chroma_cu_loc; - if(!recon_luma && recon_chroma) { - uvg_cu_loc_ctor(&chroma_cu_loc, cu_loc->x & ~7, cu_loc->y & ~7, width, height); - cu_loc = &chroma_cu_loc; - } - + // Reset CBFs because CBFs might have been set // for depth earlier if (recon_luma) { @@ -1846,7 +1843,7 @@ void uvg_intra_recon_cu( } cu_loc_t split_cu_loc[4]; - const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { uvg_intra_recon_cu(state, search_data, &split_cu_loc[i], NULL, lcu, tree_type, recon_luma, recon_chroma); } @@ -1876,7 +1873,7 @@ void uvg_intra_recon_cu( } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; - const bool has_chroma = recon_chroma && (cu_loc->x % 8 == 0 && cu_loc->y % 8 == 0); + const bool has_chroma = recon_chroma; // Process a leaf TU. 
if (has_luma) { diff --git a/src/intra.h b/src/intra.h index 7c4c8852..022b8ce1 100644 --- a/src/intra.h +++ b/src/intra.h @@ -169,7 +169,8 @@ int8_t uvg_wide_angle_correction( int_fast8_t mode, const bool is_isp, const int log2_width, - const int log2_height); + const int log2_height, + const bool account_for_dc_planar); // ISP related defines #define NUM_ISP_MODES 3 diff --git a/src/search.c b/src/search.c index 5166e47f..e11d6d15 100644 --- a/src/search.c +++ b/src/search.c @@ -380,18 +380,23 @@ double uvg_cu_rd_cost_luma( if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; - const int half_width = cu_loc->width >> 1; - const int half_height = cu_loc->height >> 1; - cu_loc_t split_cu_loc; + // Recursively process sub-CUs. + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y+ half_height, half_width, half_height); - sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc[i], pred_cu, lcu, isp_cbf); + } return sum + tr_tree_bits * state->lambda; } @@ -478,20 +483,12 @@ double 
uvg_cu_rd_cost_chroma( const cu_loc_t * const cu_loc) { const vector2d_t lcu_px = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; - const int width = cu_loc->chroma_width; - const int height = cu_loc->chroma_height; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); double tr_tree_bits = 0; double coeff_bits = 0; - if (cu_loc->width == 4 && cu_loc->height == 4 && (cu_loc->x % 8 == 0 || cu_loc->y % 8 == 0)) { - // For MAX_PU_DEPTH calculate chroma for previous depth for the first - // block and return 0 cost for all others. - return 0; - } - const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, COLOR_U); int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, COLOR_V); @@ -499,18 +496,22 @@ double uvg_cu_rd_cost_chroma( if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; // Recursively process sub-CUs. 
- const int half_width = cu_loc->width >> 1; - const int half_height = cu_loc->height >> 1; - cu_loc_t split_cu_loc; + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); - sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc[i]); + } return sum + tr_tree_bits * state->lambda; } @@ -544,10 +545,10 @@ double uvg_cu_rd_cost_chroma( int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width); int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width); ssd = ssd_u + ssd_v; } @@ -580,7 +581,9 @@ static double cu_rd_cost_tr_split_accurate( lcu_t* const lcu, enum uvg_tree_type tree_type, uint8_t isp_cbf, - const cu_loc_t* const cu_loc) { + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + bool has_chroma) { const int width = cu_loc->width; const int height = cu_loc->height; // TODO: height for 
non-square blocks @@ -590,8 +593,6 @@ static double cu_rd_cost_tr_split_accurate( double coeff_bits = 0; double tr_tree_bits = 0; - - const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, COLOR_U); const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, COLOR_V); @@ -610,22 +611,24 @@ static double cu_rd_cost_tr_split_accurate( if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; - - const int half_width = cu_loc->width >> 1; - const int half_height = cu_loc->height >> 1; - cu_loc_t split_cu_loc; - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); - sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); - sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); - sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); - uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); - sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc); + enum split_type split; + if(cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } else if(cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + const int split_count= uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], &split_cu_loc[i], has_chroma); + } return sum + tr_tree_bits * state->lambda; } - bool has_chroma = 
state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T; + has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && has_chroma && tree_type != UVG_LUMA_T; if (!skip_residual_coding && has_chroma) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); @@ -712,7 +715,7 @@ static double cu_rd_cost_tr_split_accurate( } } - const bool is_local_sep_tree = pred_cu->log2_width + pred_cu->log2_height < 6 && tree_type == UVG_BOTH_T; + const bool is_local_sep_tree = (cu_loc->width != chroma_loc->width || cu_loc->height != chroma_loc->height) && state->encoder_control->chroma_format != UVG_CSP_400; if(is_local_sep_tree || tree_type == UVG_LUMA_T) { @@ -738,11 +741,11 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { - cu_loc_t chroma_loc; - const vector2d_t lcu_px = { (cu_loc->local_x >> 1) & ~3, (cu_loc->local_y >> 1) &~3 }; - uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, width, height); - const int chroma_width = cu_loc->chroma_width; - const int chroma_height = cu_loc->chroma_height; // TODO: height for non-square blocks + cu_loc_t temp_chroma_loc; + const vector2d_t lcu_px = { chroma_loc->local_x >> 1, chroma_loc->local_y >> 1}; + uvg_cu_loc_ctor(&temp_chroma_loc, lcu_px.x, lcu_px.y, chroma_loc->width, chroma_loc->height); + const int chroma_width = chroma_loc->chroma_width; + const int chroma_height = chroma_loc->chroma_height; int8_t scan_order = SCAN_DIAG; //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); @@ -766,8 +769,8 @@ static double cu_rd_cost_tr_split_accurate( if(chroma_can_use_tr_skip && cb_flag_v) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, 
lcu->coeff.u, tr_cu, &chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &temp_chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { @@ -789,7 +792,7 @@ static double cu_rd_cost_tr_split_accurate( } const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; - if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, cu_loc)) { + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? cu_loc : chroma_loc)) { const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -931,10 +934,11 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) static double search_cu( encoder_state_t* const state, const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, - enum uvg_tree_type - tree_type, - const split_tree_t split_tree) + enum uvg_tree_type tree_type, + const split_tree_t split_tree, + bool has_chroma) { const int depth = split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; @@ -1091,9 +1095,8 @@ static double search_cu( double intra_cost = intra_search.cost; if (intra_cost < cost && tree_type != UVG_LUMA_T) { int8_t intra_mode = intra_search.pred_cu.intra.mode; - - // TODO: This heavily relies to square CUs - if ((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 && y % 8) || tree_type == UVG_CHROMA_T) + + if ((has_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != 
UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; @@ -1104,7 +1107,7 @@ static double search_cu( } intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) { - uvg_search_cu_intra_chroma(state, cu_loc, lcu, &intra_search, tree_type); + uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, tree_type, cu_loc->x != chroma_loc->x || cu_loc->y != chroma_loc->y); if (intra_search.pred_cu.joint_cb_cr == 0) { intra_search.pred_cu.joint_cb_cr = 4; @@ -1118,13 +1121,13 @@ static double search_cu( intra_search.pred_cu.intra.mode_chroma = 0; } uvg_intra_recon_cu(state, - &intra_search, cu_loc, + &intra_search, chroma_loc, &intra_search.pred_cu, lcu, tree_type, false, true); if(tree_type != UVG_CHROMA_T) { - intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, cu_loc); + intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, chroma_loc); } else { intra_cost = intra_search.cost; @@ -1178,7 +1181,7 @@ static double search_cu( bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if ((cur_cu->log2_height + cur_cu->log2_width < 6) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if ((cur_cu->log2_height + cur_cu->log2_width < 6) || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); @@ -1189,12 +1192,12 @@ static double search_cu( recon_luma, recon_chroma); - if((cur_cu->log2_height + cur_cu->log2_width < 6 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) + if((cur_cu->log2_height + cur_cu->log2_width < 6 && has_chroma && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; 
uvg_intra_recon_cu(state, - &intra_search, cu_loc, - NULL, lcu, + &intra_search, chroma_loc, + cur_cu, lcu, tree_type, false, true); @@ -1279,9 +1282,9 @@ static double search_cu( // The cabac functions assume chroma locations whereas the search uses luma locations // for the chroma tree, therefore we need to shift the chroma coordinates here for // passing to the bit cost calculating functions. - cu_loc_t chroma_loc = *cu_loc; - chroma_loc.y >>= 1; - chroma_loc.x >>= 1; + cu_loc_t separate_tree_chroma_loc = *cu_loc; + separate_tree_chroma_loc.y >>= 1; + separate_tree_chroma_loc.x >>= 1; if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; @@ -1291,7 +1294,7 @@ static double search_cu( bits += uvg_mock_encode_coding_unit( state, cabac, - tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, + tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, lcu, cur_cu, tree_type, @@ -1300,7 +1303,7 @@ static double search_cu( cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { // cost = inter_zero_coeff_cost; @@ -1335,7 +1338,7 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { - const int split_type = depth == 0 ? QT_SPLIT : TT_HOR_SPLIT; + const int split_type = depth == 2 ? TT_HOR_SPLIT : QT_SPLIT; const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, @@ -1378,7 +1381,7 @@ static double search_cu( &state->search_cabac, left_cu, above_cu, - tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, + tree_type != UVG_CHROMA_T ? 
cu_loc : &separate_tree_chroma_loc, split_tree, tree_type, &split_bits); @@ -1393,11 +1396,17 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); cu_loc_t new_cu_loc[4]; - const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc); + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); for (int split = 0; split < splits; ++split) { - split_cost += search_cu(state, &new_cu_loc[split], &split_lcu, tree_type, new_split); + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? cu_loc : &new_cu_loc[split], + &split_lcu, + tree_type, new_split, + !separate_chroma || split == splits - 1); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma if (split_cost > cost) { break; } @@ -1460,7 +1469,7 @@ static double search_cu( double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits; cost += mode_bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); @@ -1724,9 +1733,11 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con double cost = search_cu( state, &start, + NULL, &work_tree, tree_type, - split_tree); + split_tree, + false); // Save squared cost for rate control. 
if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1743,8 +1754,10 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( state, &start, - &work_tree, - UVG_CHROMA_T, split_tree); + NULL, + &work_tree, UVG_CHROMA_T, + split_tree, + false); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; diff --git a/src/search_intra.c b/src/search_intra.c index 07826cec..92c4903f 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -590,7 +590,7 @@ static double search_intra_trdepth( } cu_loc_t split_cu_loc[4]; - const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { split_cost += search_intra_trdepth(state, &split_cu_loc[i], nosplit_cost, search_data, lcu, tree_type); } @@ -1418,10 +1418,11 @@ int8_t uvg_search_intra_chroma_rdo( encoder_state_t * const state, int8_t num_modes, lcu_t *const lcu, + const cu_loc_t* const cu_loc, intra_search_data_t* chroma_data, int8_t luma_mode, enum uvg_tree_type tree_type, - const cu_loc_t* const cu_loc) + bool is_separate) { const bool reconstruct_chroma = true; @@ -1446,7 +1447,7 @@ int8_t uvg_search_intra_chroma_rdo( const int offset = ((cu_loc->local_x & ~7) >> 1) + ((cu_loc->local_y & ~7) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((cu_loc->width == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { + if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } @@ -1528,7 +1529,7 @@ int8_t uvg_search_intra_chroma_rdo( u_resi, v_resi, &chorma_ts_out, - tree_type); + is_separate ? 
UVG_CHROMA_T : tree_type); // LFNST constraint failed if(chorma_ts_out.best_u_index == -1 && chorma_ts_out.best_combined_index == -1) { @@ -1590,7 +1591,8 @@ int8_t uvg_search_cu_intra_chroma( const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + bool is_separate) { const cu_info_t *cur_pu = &search_data->pred_cu; @@ -1604,9 +1606,7 @@ int8_t uvg_search_cu_intra_chroma( break; } } - - cu_loc_t chroma_loc; - uvg_cu_loc_ctor(&chroma_loc, cu_loc->x & ~7, cu_loc->y & ~7, cu_loc->width, cu_loc->height); + // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made @@ -1638,11 +1638,11 @@ int8_t uvg_search_cu_intra_chroma( num_modes = search_intra_chroma_rough(state, chroma_data, lcu, intra_mode, tree_type, - &chroma_loc); + cu_loc); } if (num_modes > 1 || state->encoder_control->cfg.jccr) { - uvg_search_intra_chroma_rdo(state, num_modes, lcu, chroma_data, intra_mode, tree_type, &chroma_loc); + uvg_search_intra_chroma_rdo(state, num_modes, lcu, cu_loc, chroma_data, intra_mode, tree_type, is_separate); } else if(cur_pu->lfnst_idx) { chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx; diff --git a/src/search_intra.h b/src/search_intra.h index faa26ff1..390187b2 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -55,7 +55,8 @@ int8_t uvg_search_cu_intra_chroma( const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t* best_cclm, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + bool is_separate); void uvg_search_cu_intra( encoder_state_t * const state, diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index c352b395..d5fdb88e 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2586,7 +2586,7 @@ static void mts_dct_generic( //const int log2_width_minus2 = uvg_g_convert_to_bit[width]; //const 
int log2_height_minus2 = uvg_g_convert_to_bit[height]; - if(tu->lfnst_idx || tu->cr_lfnst_idx) { + if((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { skip_width = width - 4; @@ -2639,7 +2639,7 @@ static void mts_idct_generic( const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - if (tu->lfnst_idx || tu->cr_lfnst_idx) { + if ((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { skip_width = width - 4; skip_height = height - 4; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 84373d21..01364ab1 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -66,8 +66,7 @@ static void uvg_angular_pred_generic( const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; - // Log2_dim 1 is possible with ISP blocks - assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); // assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; @@ -249,7 +248,7 @@ static void uvg_angular_pred_generic( // PDPC - bool PDPC_filter = ((tmp_width >= TR_MIN_WIDTH && tmp_height >= TR_MIN_WIDTH) || channel_type != 0); + bool PDPC_filter = ((tmp_width >= TR_MIN_WIDTH && tmp_height >= TR_MIN_WIDTH) || channel_type != 0) && multi_ref_index == 0; if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. 
PDPC_filter = false; diff --git a/src/transform.c b/src/transform.c index 968ae440..34514e82 100644 --- a/src/transform.c +++ b/src/transform.c @@ -574,7 +574,7 @@ void uvg_chroma_transform_search( pred_cu->cr_lfnst_idx); if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; - if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (cu_loc->width == 4 || tree_type == UVG_CHROMA_T)) { + if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { bool constraints[2] = { false, false }; uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U); if(!is_jccr) { @@ -863,6 +863,8 @@ void uvg_fwd_lfnst( const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; bool mts_skip = cur_cu->tr_idx == MTS_SKIP; + // This check is safe for 8x16 cus split with TT, since it is checking the dimensions of the + // last luma CU which will be 8x4, i.e., 3 + 2 < 6 bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] @@ -879,12 +881,12 @@ void uvg_fwd_lfnst( if (is_cclm_mode) { intra_mode = cur_cu->intra.mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. 
Must be in [0, 2]"); - int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height); + int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height, true); // Transform wide angle mode to intra mode intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); @@ -1010,12 +1012,12 @@ void uvg_inv_lfnst( if (is_cclm_mode) { intra_mode = cur_cu->intra.mip_flag ? 0 : cur_cu->intra.mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); - int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height); + int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height, true); intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); @@ -1175,6 +1177,7 @@ static void quantize_tr_residual( cur_pu->log2_width + cur_pu-> log2_height < 6&& (x % 4 != 0 || y % 4 != 0); if (handled_elsewhere) { + assert(0); return; } @@ -1413,7 +1416,7 @@ void uvg_quantize_lcu_residual( cu_loc_t split_cu_loc[4]; uint16_t child_cbfs[3]; - const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc); + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { uvg_quantize_lcu_residual(state, luma, chroma, 0, &split_cu_loc[i], NULL, lcu, early_skip, tree_type); if(i != 0) { From 43a710e104f656f31be190aeab1f061f8a2b5fd0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Nov 2022 09:47:36 +0200 Subject: [PATCH 123/254] fix rebase --- src/encode_coding_tree.c | 42 ++++++++++---------------- src/intra.c | 10 +++--- src/search.c | 8 ++--- src/strategies/generic/intra-generic.c | 14 ++++----- 4 files changed, 32 
insertions(+), 42 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index c908449d..2790932f 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -129,8 +129,7 @@ bool uvg_is_lfnst_allowed( } bool luma_flag = tree_type != UVG_CHROMA_T; bool chroma_flag = tree_type != UVG_LUMA_T; - bool non_zero_coeff_non_ts_corner_8x8 = false; - bool last_scan_pos = false; + bool non_zero_coeff_non_ts_corner_8x8 = (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); bool is_tr_skip = false; if (color == COLOR_Y && pred_cu->tr_idx == MTS_SKIP) { @@ -177,10 +176,10 @@ static bool encode_lfnst_idx( return true; } else { - if(depth != 4 || color == COLOR_Y) { + if(color == COLOR_Y) { assert(pred_cu->lfnst_idx == 0); } - if(depth == 4 && color != COLOR_Y) { + if(tree_type == UVG_CHROMA_T && color != COLOR_Y) { assert(pred_cu->cr_lfnst_idx == 0); } return false; @@ -612,7 +611,8 @@ static void encode_transform_coeff( cabac_data_t * const cabac = &state->cabac; bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; - + int x = cu_loc->x; + int y = cu_loc->y; if (isp_split) { uvg_get_isp_cu_arr_coords(&x, &y); } @@ -621,7 +621,7 @@ static void encode_transform_coeff( const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? 
frame->cu_array : frame->chroma_cu_array; if(cur_tu == NULL) { - cur_tu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); + cur_tu = uvg_cu_array_at_const(used_array, x, y); } const bool ver_split = cu_loc->height > TR_MAX_WIDTH; @@ -647,36 +647,25 @@ static void encode_transform_coeff( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, luma_cbf_ctx, &split_cu_loc[i]); + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i]); } return; } + + // Chroma cb flags are not signaled when one of the following: // No chroma. // Not the last CU for area of 64 pixels cowered by more than one luma CU. // Not the last ISP Split - if (state->encoder_control->chroma_format != UVG_CSP_400 + if (state->encoder_control->chroma_format != UVG_CSP_400 && (cur_tu->log2_height + cur_tu->log2_width >= 6 || only_chroma) - && tree_type != UVG_LUMA_T + && tree_type != UVG_LUMA_T && last_split) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 
1 : 0]); - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); } - - - for (int j = 0; j < 2; j++) { - for (int i = 0; i < 2; i++) { - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, (x + i * split_width), (y + j * split_height), width >> 1, height >> 1); - - encode_transform_coeff(state, &loc, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type, true, false, luma_cbf_ctx, &loc); - } - } - return; - } - // Luma coded block flag is signaled when one of the following: // - prediction mode is intra // - transform depth > 0 @@ -1417,7 +1406,7 @@ void uvg_encode_coding_tree( const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); for (int split = 0; split encoder_control->chroma_format != UVG_CSP_400 && - (has_chroma || tree_type == UVG_CHROMA_T) && + if (state->encoder_control->chroma_format != UVG_CSP_400 && + (((chroma_loc->width != cu_loc->width || chroma_loc->height != cu_loc->height)&& + has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints diff --git a/src/intra.c b/src/intra.c index 5db5abe5..bb696c0c 100644 --- a/src/intra.c +++ b/src/intra.c @@ -957,7 +957,7 @@ static void intra_predict_regular( // Wide angle correction int8_t pred_mode = uvg_wide_angle_correction(mode, - is_isp, + isp_mode, log2_width, log2_height, false); @@ -1859,15 +1859,15 @@ void uvg_intra_recon_cu( for (int i = 0; i < split_limit; ++i) { cu_loc_t tu_loc; - uvg_get_isp_split_loc(&tu_loc, x, y, width, height, i, split_type, true); + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); cu_loc_t pu_loc; - uvg_get_isp_split_loc(&pu_loc, x, y, width, height, i, split_type, false); + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); if(tu_loc.x % 4 == 0) { - intra_recon_tb_leaf(state, &pu_loc, &origin_cu, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data, tree_type); } uvg_quantize_lcu_residual(state, true, false, false, - &tu_loc, depth, cur_cu, lcu, + &tu_loc, cur_cu, lcu, false, tree_type); search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, COLOR_Y) << i; } diff --git a/src/search.c b/src/search.c index e11d6d15..58c190d0 100644 --- a/src/search.c +++ b/src/search.c @@ -423,9 +423,9 @@ double uvg_cu_rd_cost_luma( } else { // TODO: 8x4 CUs - const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + const int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; for (int i = 0; i < split_limit; i++) { - int luma_ctx = 2; if (i != 3 && isp_cbf != 0x8) { const int flag = (isp_cbf >> i) & 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); @@ -1338,7 +1338,7 @@ static double search_cu( // Recursively split all the way to max search depth. 
if (can_split_cu) { - const int split_type = depth == 2 ? TT_HOR_SPLIT : QT_SPLIT; + const int split_type = depth == 2 ? BT_HOR_SPLIT : QT_SPLIT; const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, @@ -1402,7 +1402,7 @@ static double search_cu( initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); for (int split = 0; split < splits; ++split) { split_cost += search_cu(state, - &new_cu_loc[split], separate_chroma ? cu_loc : &new_cu_loc[split], + &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], &split_lcu, tree_type, new_split, !separate_chroma || split == splits - 1); diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 01364ab1..9a3cbe26 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -192,7 +192,7 @@ static void uvg_angular_pred_generic( if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. - for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < tmp_height; ++y, delta_pos += sample_disp) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); @@ -219,7 +219,7 @@ static void uvg_angular_pred_generic( const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; int16_t const * const f = use_cubic ? 
cubic_filter[delta_fract] : filter_coeff; // Do 4-tap intra interpolation filtering - for (int_fast32_t x = 0; x < tmp_width; x++, ref_main_index++) { + for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { p[0] = ref_main[ref_main_index]; p[1] = ref_main[ref_main_index + 1]; p[2] = ref_main[ref_main_index + 2]; @@ -232,7 +232,7 @@ static void uvg_angular_pred_generic( else { // Do linear filtering - for (int_fast32_t x = 0; x < tmp_width; ++x) { + for (int_fast32_t x = 0; x < width; ++x) { uvg_pixel ref1 = ref_main[x + delta_int + 1]; uvg_pixel ref2 = ref_main[x + delta_int + 2]; work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); @@ -248,7 +248,7 @@ static void uvg_angular_pred_generic( // PDPC - bool PDPC_filter = ((tmp_width >= TR_MIN_WIDTH && tmp_height >= TR_MIN_WIDTH) || channel_type != 0) && multi_ref_index == 0; + bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0) && multi_ref_index == 0; if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. 
PDPC_filter = false; @@ -259,7 +259,7 @@ static void uvg_angular_pred_generic( } if(PDPC_filter) { int inv_angle_sum = 256; - for (int x = 0; x < MIN(3 << scale, tmp_width); x++) { + for (int x = 0; x < MIN(3 << scale, width); x++) { inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; int wL = 32 >> (2 * x >> scale); @@ -274,7 +274,7 @@ static void uvg_angular_pred_generic( // Do not apply PDPC if multi ref line index is other than 0 // TODO: do not do PDPC if block is in BDPCM mode - bool do_pdpc = (((tmp_width >= 4 && tmp_height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); + bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { int scale = (log2_width + log2_height - 2) >> 2; @@ -282,7 +282,7 @@ static void uvg_angular_pred_generic( for (int_fast32_t y = 0; y < height; ++y) { memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; - for (int_fast32_t x = 0; x < MIN(3 << scale, tmp_width); ++x) { + for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) { const int wL = 32 >> (2 * x >> scale); const uvg_pixel val = work[y * width + x]; work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); From 7b117f171f4aeeceef7a960fdda672f8c02e7bbd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Nov 2022 15:04:57 +0200 Subject: [PATCH 124/254] [mtt] WIP 16x16 TT split --- src/encode_coding_tree.c | 7 ++++--- src/encode_coding_tree.h | 1 + src/intra.c | 3 ++- src/search.c | 29 ++++++++++++++++---------- src/search_inter.c | 4 ++-- src/search_intra.c | 8 +++---- src/strategies/generic/intra-generic.c | 6 +++--- 7 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 2790932f..ce47f566 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -452,8 +452,8 @@ static void 
encode_chroma_tu( { int width_c = cu_loc->chroma_width; int height_c = cu_loc->chroma_height; - int x_local = ((cu_loc->x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; - int y_local = ((cu_loc->y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; + int x_local = (cu_loc->x >> (tree_type != UVG_CHROMA_T)) % LCU_WIDTH_C; + int y_local = (cu_loc->y >> (tree_type != UVG_CHROMA_T)) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; *scan_idx = SCAN_DIAG; if(!joint_chroma){ @@ -1668,6 +1668,7 @@ double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type, @@ -1780,7 +1781,7 @@ double uvg_mock_encode_coding_unit( if(tree_type != UVG_CHROMA_T) { uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } - if((cur_cu->log2_height + cur_cu->log2_width >= 6 || (x % 8 != 0 && y % 8 != 0) || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if((chroma_loc || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 5a9b4023..96e0cfb7 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -77,6 +77,7 @@ double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type, diff --git a/src/intra.c b/src/intra.c index bb696c0c..883091c9 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1121,6 +1121,7 @@ void uvg_intra_build_reference_any( else { const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); px_available_left = is_dual_tree || !is_chroma ? 
num_cus * 4 : num_cus *2; + px_available_left -= px.x % 4; } // Limit the number of available pixels based on block size and dimensions @@ -1440,7 +1441,7 @@ void uvg_intra_build_reference_inner( int i = multi_ref_index; // Offset by multi_ref_index // Do different loop for heights smaller than 4 (possible for some ISP splits) - if (lcu_px.y % 4 != 0) { + if (px.y % 4 != 0) { do { out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; diff --git a/src/search.c b/src/search.c index 58c190d0..19d79e07 100644 --- a/src/search.c +++ b/src/search.c @@ -188,8 +188,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); - const int chroma_x = (cu_loc->x >> 1) & ~3; - const int chroma_y = (cu_loc->y >> 1) & ~3; + const int chroma_x = (cu_loc->x >> 1); + const int chroma_y = (cu_loc->y >> 1); const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C); copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); @@ -209,11 +209,16 @@ static void work_tree_copy_up( bool joint, enum uvg_tree_type tree_type, - const cu_loc_t* const cu_loc) + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc) { copy_cu_info (from, to, cu_loc, tree_type); - copy_cu_pixels(from, to, cu_loc, tree_type); - copy_cu_coeffs(cu_loc, from, to, joint, tree_type); + copy_cu_pixels(from, to, cu_loc, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); + copy_cu_coeffs(cu_loc, from, to, joint, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? 
UVG_LUMA_T : tree_type); + if (cu_loc != chroma_loc && tree_type == UVG_LUMA_T) { + copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); + copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); + } } @@ -482,7 +487,7 @@ double uvg_cu_rd_cost_chroma( lcu_t *const lcu, const cu_loc_t * const cu_loc) { - const vector2d_t lcu_px = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; + const vector2d_t lcu_px = { (cu_loc->local_x) / 2, (cu_loc->local_y) / 2 }; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); @@ -787,7 +792,7 @@ static double cu_rd_cost_tr_split_accurate( if (chroma_can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } @@ -949,6 +954,7 @@ static double search_cu( const int y = cu_loc->y; const int luma_width = cu_loc->width; const int luma_height = cu_loc->height; + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -1181,7 +1187,7 @@ static double search_cu( bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if ((cur_cu->log2_height + cur_cu->log2_width < 6) || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); @@ -1192,7 +1198,7 
@@ static double search_cu( recon_luma, recon_chroma); - if((cur_cu->log2_height + cur_cu->log2_width < 6 && has_chroma && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) + if((is_separate_tree && has_chroma && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, @@ -1295,6 +1301,7 @@ static double search_cu( state, cabac, tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, + is_separate_tree && !has_chroma ? NULL : chroma_loc, lcu, cur_cu, tree_type, @@ -1338,7 +1345,7 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { - const int split_type = depth == 2 ? BT_HOR_SPLIT : QT_SPLIT; + const int split_type = depth == 2 ? TT_VER_SPLIT : QT_SPLIT; const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, @@ -1479,7 +1486,7 @@ static double search_cu( if (split_cost < cost) { // Copy split modes to this depth. cost = split_cost; - work_tree_copy_up(&split_lcu, lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc); + work_tree_copy_up(&split_lcu, lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc, is_separate_tree && !has_chroma ? 
NULL : chroma_loc); #if UVG_DEBUG //debug_split = 1; #endif diff --git a/src/search_inter.c b/src/search_inter.c index 4703152a..8d73cf04 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2172,10 +2172,10 @@ void uvg_cu_cost_inter_rd2( const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth }; if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; diff --git a/src/search_intra.c b/src/search_intra.c index 92c4903f..ff08b1f8 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -630,7 +630,7 @@ static int search_intra_chroma_rough( { const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width]; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; const int width = 1 << log2_width_c; const int height = width; // TODO: height for non-square blocks @@ -642,7 +642,7 @@ static int search_intra_chroma_rough( uvg_intra_references refs_v; uvg_intra_build_reference(state, &loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); - vector2d_t lcu_cpx = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 }; + vector2d_t lcu_cpx = { (cu_loc->local_x) 
/ 2, (cu_loc->local_y) / 2 }; uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; @@ -1429,7 +1429,7 @@ int8_t uvg_search_intra_chroma_rdo( const int chroma_width = cu_loc->chroma_width; const int chroma_height = cu_loc->chroma_height; uvg_intra_references refs[2]; - const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y }; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height, @@ -1444,7 +1444,7 @@ int8_t uvg_search_intra_chroma_rdo( cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); - const int offset = ((cu_loc->local_x & ~7) >> 1) + ((cu_loc->local_y & ~7) >> 1)* LCU_WIDTH_C; + const int offset = ((cu_loc->local_x) >> 1) + ((cu_loc->local_y) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 9a3cbe26..b7ab7e94 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -131,7 +131,7 @@ static void uvg_angular_pred_generic( const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18); // Sample displacement per column in fractions of 32. - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + const int16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; const int side_size = vertical_mode ? 
log2_height : log2_width; int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); @@ -248,7 +248,7 @@ static void uvg_angular_pred_generic( // PDPC - bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0) && multi_ref_index == 0; + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0; if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; @@ -274,7 +274,7 @@ static void uvg_angular_pred_generic( // Do not apply PDPC if multi ref line index is other than 0 // TODO: do not do PDPC if block is in BDPCM mode - bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); + bool do_pdpc = ((width >= 4 && height >= 4) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); if (do_pdpc) { int scale = (log2_width + log2_height - 2) >> 2; From 13aae7d03d235be376f0373d38a1e071ab8024d5 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Nov 2022 13:50:22 +0200 Subject: [PATCH 125/254] [mtt] All individual mtt splits should be working + uvg_get_possible_splits --- src/cu.c | 113 +++++++++++++++++++++++++++++++++++++++ src/cu.h | 4 ++ src/encode_coding_tree.c | 64 +++++++++++++--------- src/encoderstate.c | 2 +- src/intra.c | 11 ++-- src/intra.h | 2 +- src/search.c | 25 +++++---- src/search_inter.c | 2 +- src/search_intra.c | 17 +++--- src/search_intra.h | 1 + src/transform.c | 4 +- 11 files changed, 191 insertions(+), 54 deletions(-) diff --git a/src/cu.c b/src/cu.c index b0cb2a63..10d8aabb 100644 --- a/src/cu.c +++ b/src/cu.c @@ -34,6 +34,9 @@ #include #include "cu.h" + +#include "alf.h" +#include "encoderstate.h" #include "threads.h" @@ -365,6 +368,116 @@ int uvg_get_split_locs( return 0; } + +int uvg_get_implicit_split(const encoder_state_t* const state, const cu_loc_t* const cu_loc) +{ + bool right_ok = state->tile->frame->width >= cu_loc->x + 
cu_loc->width; + bool bottom_ok = state->tile->frame->height >= cu_loc->y + cu_loc->height; + + if (right_ok && bottom_ok) return NO_SPLIT; + if (right_ok) return BT_HOR_SPLIT; + if (bottom_ok) return BT_VER_SPLIT; + return QT_SPLIT; +} + + +int uvg_get_possible_splits(const encoder_state_t * const state, + const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]) +{ + const int width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc); + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; + + const unsigned max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; // +currImplicitBtDepth; + const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type]; + const unsigned min_bt_size = 1 << MIN_SIZE; + const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type]; + const unsigned min_tt_size = 1 << MIN_SIZE; + const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; + + splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; + bool can_btt = split_tree.mtt_depth < max_btd; + + const enum split_type last_split = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; + const enum split_type parl_split = last_split == BT_HOR_SPLIT ? 
BT_HOR_SPLIT : BT_VER_SPLIT; + + // don't allow QT-splitting below a BT split + if (split_tree.current_depth != 0 && last_split != QT_SPLIT && (width > 64 || height > 64)) splits[QT_SPLIT] = false; + if (width <= min_qt_size) splits[QT_SPLIT] = false; + + if (tree_type == UVG_CHROMA_T && width <= 4) splits[QT_SPLIT] = false; + if (tree_type == UVG_CHROMA_T) + { + splits[QT_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false; + return; + } + if (implicitSplit != NO_SPLIT) + { + splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false; + + splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT; + splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT; + if (tree_type == UVG_CHROMA_T && width == 4) splits[BT_VER_SPLIT] = false; + if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true; + return 1; + } + + if ((last_split == TT_HOR_SPLIT || last_split == TT_VER_SPLIT) && split_tree.part_index == 1) + { + splits[BT_HOR_SPLIT] = parl_split != BT_HOR_SPLIT; + splits[BT_VER_SPLIT] = parl_split != BT_VER_SPLIT; + } + + if (can_btt && (width <= min_bt_size && height <= min_bt_size) + && ((width <= min_tt_size && height <= min_tt_size))) + { + can_btt = false; + } + if (can_btt && (width > max_bt_size || height > max_bt_size) + && ((width > max_tt_size || height > max_tt_size))) + { + can_btt = false; + } + + if (!can_btt) + { + splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false; + + return; + } + + if (width > max_bt_size || height > max_bt_size) + { + splits[BT_HOR_SPLIT] = splits[BT_VER_SPLIT] = false; + } + + // specific check for BT splits + if (height <= min_bt_size) splits[BT_HOR_SPLIT] = false; + if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 16) splits[BT_HOR_SPLIT] = false; + + if (width <= min_bt_size) splits[BT_VER_SPLIT] = false; + if (width <= 
64 && height > 64) splits[BT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 16 || width == 4)) splits[BT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; + + if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) + splits[TT_HOR_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 16 * 2) splits[TT_HOR_SPLIT] = false; + + if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) + splits[TT_VER_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 16 * 2 || width == 8)) splits[TT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 64) splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false; + return 0; +} + + int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left) { if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) { diff --git a/src/cu.h b/src/cu.h index 48a021c3..5855eaed 100644 --- a/src/cu.h +++ b/src/cu.h @@ -106,6 +106,7 @@ typedef struct { uint32_t split_tree; uint8_t current_depth; uint8_t mtt_depth; + uint8_t part_index; } split_tree_t; @@ -185,12 +186,15 @@ typedef struct { } cu_loc_t; void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); +typedef struct encoder_state_t encoder_state_t; int uvg_get_split_locs( const cu_loc_t* const origin, enum split_type split, cu_loc_t out[4], uint8_t* separate_chroma); +int uvg_get_possible_splits(const encoder_state_t* const state, + const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ce47f566..54a7a639 100644 --- a/src/encode_coding_tree.c +++ 
b/src/encode_coding_tree.c @@ -508,7 +508,8 @@ static void encode_transform_unit( bool only_chroma, enum uvg_tree_type tree_type, bool last_split, - const cu_loc_t *original_loc) // Original cu dimensions, before CU split + const cu_loc_t *original_loc, + const cu_loc_t* const chroma_loc) // Original cu dimensions, before CU split { const videoframe_t * const frame = state->tile->frame; cabac_data_t* const cabac = &state->cabac; @@ -581,10 +582,10 @@ static void encode_transform_unit( bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, COLOR_U) || cbf_is_set(cur_pu->cbf, COLOR_V); - if ((chroma_cbf_set || joint_chroma) && last_split) { + if ((chroma_cbf_set || joint_chroma) && last_split && chroma_loc) { //Need to drop const to get lfnst constraints // Use original dimensions instead of ISP split dimensions - encode_chroma_tu(state, original_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + encode_chroma_tu(state, chroma_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); } } @@ -605,8 +606,10 @@ static void encode_transform_coeff( enum uvg_tree_type tree_type, bool last_split, bool can_skip_last_cbf, - int *luma_cbf_ctx, // Always true except when writing sub partition coeffs (ISP) - const cu_loc_t * const original_loc) // Original dimensions before ISP split + int *luma_cbf_ctx, + // Always true except when writing sub partition coeffs (ISP) + const cu_loc_t * const original_loc, + const cu_loc_t* const chroma_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; @@ -647,7 +650,8 @@ static void encode_transform_coeff( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i]); + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, + coeff, NULL, tree_type, true, false, 
luma_cbf_ctx, &split_cu_loc[i], &split_cu_loc[i]); } return; } @@ -658,7 +662,7 @@ static void encode_transform_coeff( // Not the last CU for area of 64 pixels cowered by more than one luma CU. // Not the last ISP Split if (state->encoder_control->chroma_format != UVG_CSP_400 - && (cur_tu->log2_height + cur_tu->log2_width >= 6 || only_chroma) + && (chroma_loc || only_chroma) && tree_type != UVG_LUMA_T && last_split) { cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); @@ -684,7 +688,7 @@ static void encode_transform_coeff( } if (cb_flag_y | cb_flag_u | cb_flag_v) { - if (state->must_code_qp_delta && (only_chroma || cb_flag_y || cur_tu->log2_height + cur_tu->log2_width >= 6) ) { + if (state->must_code_qp_delta && (only_chroma || cb_flag_y || chroma_loc) ) { const int qp_pred = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, state->last_qp); const int qp_delta = cur_tu->qp - qp_pred; // Possible deltaQP range depends on bit depth as stated in HEVC specification. @@ -711,7 +715,7 @@ static void encode_transform_coeff( ((cb_flag_u || cb_flag_v ) && cur_tu->type == CU_INTRA) || (cb_flag_u && cb_flag_v)) - && (cur_tu->log2_height + cur_tu->log2_width >= 6 || only_chroma || tree_type == UVG_CHROMA_T) + && (chroma_loc || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr && last_split ) { @@ -720,7 +724,7 @@ static void encode_transform_coeff( CABAC_BIN(cabac, cur_tu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, cu_loc, only_chroma ? cur_tu : NULL, coeff, only_chroma, tree_type, last_split, original_loc); + encode_transform_unit(state, cu_loc, only_chroma ? 
cur_tu : NULL, coeff, only_chroma, tree_type, last_split, original_loc, chroma_loc); } } @@ -855,14 +859,14 @@ int uvg_encode_inter_prediction_unit( } static void encode_chroma_intra_cu( - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - const int cclm_enabled, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + const int cclm_enabled, + int8_t luma_intra_dir, double* bits_out) { unsigned pred_mode = 0; unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83}; int8_t chroma_intra_dir = cur_cu->intra.mode_chroma; - int8_t luma_intra_dir = !cur_cu->intra.mip_flag ? cur_cu->intra.mode : 0; for(int i = 0; i < 4; i++) { if(chroma_pred_modes[i] == luma_intra_dir) { chroma_pred_modes[i] = 66; @@ -1399,12 +1403,13 @@ void uvg_encode_coding_tree( NULL); if (split_flag || border) { - const split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT)}; + split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT), 0}; cu_loc_t new_cu_loc[4]; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); for (int split = 0; split width != cu_loc->width || chroma_loc->height != cu_loc->height); // Code chroma prediction mode. - if (state->encoder_control->chroma_format != UVG_CSP_400 && cur_cu->log2_height + cur_cu->log2_width >= 6 && tree_type == UVG_BOTH_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (chroma_loc->width == cu_loc->width && chroma_loc->height == cu_loc->height) + && tree_type == UVG_BOTH_T) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, !cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0, NULL); } int luma_cbf_ctx = 0; @@ -1620,7 +1627,9 @@ void uvg_encode_coding_tree( // Check if last split to write chroma bool last_split = (i + 1) == split_limit; - encode_transform_coeff(state, &split_loc, 0, coeff, NULL, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc); + encode_transform_coeff(state, &split_loc, + 0, coeff, NULL, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, + cu_loc, is_local_dual_tree ? NULL : chroma_loc); } } @@ -1631,16 +1640,17 @@ void uvg_encode_coding_tree( // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != UVG_CSP_400 && - (((chroma_loc->width != cu_loc->width || chroma_loc->height != cu_loc->height)&& + ((is_local_dual_tree && has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, NULL, frame->cu_array, UVG_CHROMA_T); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir,NULL); // LFNST constraints must be reset here. Otherwise the left over values will interfere when calculating new constraints cu_info_t* tmp = (cu_info_t*)cur_cu; tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, chroma_loc, 1, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, chroma_loc); + encode_transform_coeff(state, chroma_loc, 1, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? 
UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } @@ -1683,6 +1693,7 @@ double uvg_mock_encode_coding_unit( int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { @@ -1782,7 +1793,10 @@ double uvg_mock_encode_coding_unit( uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } if((chroma_loc || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, + tree_type != UVG_CHROMA_T ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, is_separate_tree ? 
UVG_CHROMA_T : tree_type); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir, &bits); } } else { diff --git a/src/encoderstate.c b/src/encoderstate.c index 6c7517d8..5931f8d0 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -883,7 +883,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) //Encode coding tree cu_loc_t start; uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0, 0 }; + split_tree_t split_tree = { 0, 0, 0, 0 }; uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); diff --git a/src/intra.c b/src/intra.c index 883091c9..bcc04359 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1602,7 +1602,7 @@ void uvg_intra_predict( } // This function works on luma coordinates -const cu_info_t* uvg_get_co_located_luma_cu( +int8_t uvg_get_co_located_luma_mode( int x, int y, int width, @@ -1617,12 +1617,17 @@ const cu_info_t* uvg_get_co_located_luma_cu( x += width >> 1; y += height >> 1; } + const cu_info_t* cu; if(cu_array) { - return uvg_cu_array_at_const(cu_array, x, y); + cu = uvg_cu_array_at_const(cu_array, x, y); } else { - return LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); } + if (cu->intra.mip_flag) { + return 0; + } + return cu->intra.mode; } diff --git a/src/intra.h b/src/intra.h index 022b8ce1..9f2986eb 100644 --- a/src/intra.h +++ b/src/intra.h @@ -151,7 +151,7 @@ void uvg_intra_recon_cu( bool recon_luma, bool recon_chroma); -const cu_info_t* uvg_get_co_located_luma_cu( +int8_t uvg_get_co_located_luma_mode( int x, int y, int width, diff --git a/src/search.c b/src/search.c index 19d79e07..6a10a83e 100644 --- a/src/search.c +++ b/src/search.c @@ -1103,17 +1103,20 @@ static double search_cu( int8_t intra_mode = intra_search.pred_cu.intra.mode; if ((has_chroma || tree_type == UVG_CHROMA_T) - && 
state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.joint_cb_cr = 0; - if(tree_type == UVG_CHROMA_T) { - intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra; - intra_mode = intra_search.pred_cu.intra.mode; + if(tree_type == UVG_CHROMA_T || is_separate_tree) { + intra_mode = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, + is_separate_tree ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, UVG_CHROMA_T); intra_search.pred_cu.type = CU_INTRA; + } else if (intra_search.pred_cu.intra.mip_flag) { + intra_mode = 0; } - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) { - uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, tree_type, cu_loc->x != chroma_loc->x || cu_loc->y != chroma_loc->y); + uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, intra_mode, tree_type, is_separate_tree); if (intra_search.pred_cu.joint_cb_cr == 0) { intra_search.pred_cu.joint_cb_cr = 4; @@ -1121,7 +1124,7 @@ static double search_cu( } else if (!intra_search.pred_cu.intra.mip_flag) { - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; } else { intra_search.pred_cu.intra.mode_chroma = 0; @@ -1138,14 +1141,12 @@ static double search_cu( else { intra_cost = intra_search.cost; } - intra_search.pred_cu.intra.mode = intra_mode; intra_search.pred_cu.violates_lfnst_constrained_chroma = false; intra_search.pred_cu.lfnst_last_scan_pos = false; } else { intra_search.pred_cu.intra.mode_chroma = intra_mode; } - intra_search.pred_cu.intra.mode = intra_mode; } if (intra_cost < cost) { 
cost = intra_cost; @@ -1207,6 +1208,8 @@ static double search_cu( tree_type, false, true); + } else { + assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); } if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; @@ -1346,10 +1349,11 @@ static double search_cu( // Recursively split all the way to max search depth. if (can_split_cu) { const int split_type = depth == 2 ? TT_VER_SPLIT : QT_SPLIT; - const split_tree_t new_split = { + split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, split_tree.mtt_depth + (split_type != QT_SPLIT), + 0 }; double split_cost = 0.0; @@ -1408,6 +1412,7 @@ static double search_cu( const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); for (int split = 0; split < splits; ++split) { + new_split.part_index = split; split_cost += search_cu(state, &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], &split_lcu, diff --git a/src/search_inter.c b/src/search_inter.c index 8d73cf04..92a62795 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2169,7 +2169,7 @@ void uvg_cu_cost_inter_rd2( depth++; splits >>= 3; } - const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth }; + const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth, 0}; if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); diff --git a/src/search_intra.c b/src/search_intra.c index ff08b1f8..1ed00943 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1452,11 +1452,6 @@ int8_t uvg_search_intra_chroma_rdo( lfnst_modes_to_check[i] = i; } } - else if(chroma_data->pred_cu.lfnst_idx) { - lfnst_modes_to_check[0] = chroma_data->pred_cu.lfnst_idx; - lfnst_modes_to_check[1] = -1; - lfnst_modes_to_check[2] = -1; - } else { lfnst_modes_to_check[0] = 0; lfnst_modes_to_check[1] = -1; @@ -1591,17 +1586,17 @@ int8_t uvg_search_cu_intra_chroma( const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, + int8_t luma_mode, enum uvg_tree_type tree_type, bool is_separate) { const cu_info_t *cur_pu = &search_data->pred_cu; - int8_t intra_mode = !cur_pu->intra.mip_flag ? cur_pu->intra.mode : 0; - int8_t modes[8] = { 0, 50, 18, 1, intra_mode, 81, 82, 83 }; + int8_t modes[8] = { 0, 50, 18, 1, luma_mode, 81, 82, 83 }; uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); for(int i = 0; i < 4; i++) { - if (modes[i] == intra_mode) { + if (modes[i] == luma_mode) { modes[i] = 66; break; } @@ -1623,7 +1618,7 @@ int8_t uvg_search_cu_intra_chroma( FILL(chroma_data, 0); for (int i = 0; i < num_modes; i++) { chroma_data[i].pred_cu = *cur_pu; - chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? 
intra_mode : modes[i]; + chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? luma_mode : modes[i]; chroma_data[i].cost = 0; if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); @@ -1636,13 +1631,13 @@ int8_t uvg_search_cu_intra_chroma( if(state->encoder_control->cfg.cclm && 0){ - num_modes = search_intra_chroma_rough(state, chroma_data, lcu, intra_mode, + num_modes = search_intra_chroma_rough(state, chroma_data, lcu, luma_mode, tree_type, cu_loc); } if (num_modes > 1 || state->encoder_control->cfg.jccr) { - uvg_search_intra_chroma_rdo(state, num_modes, lcu, cu_loc, chroma_data, intra_mode, tree_type, is_separate); + uvg_search_intra_chroma_rdo(state, num_modes, lcu, cu_loc, chroma_data, luma_mode, tree_type, is_separate); } else if(cur_pu->lfnst_idx) { chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx; diff --git a/src/search_intra.h b/src/search_intra.h index 390187b2..ebcec26e 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -55,6 +55,7 @@ int8_t uvg_search_cu_intra_chroma( const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t* best_cclm, + int8_t luma_mode, enum uvg_tree_type tree_type, bool is_separate); diff --git a/src/transform.c b/src/transform.c index 34514e82..26851b8d 100644 --- a/src/transform.c +++ b/src/transform.c @@ -872,7 +872,7 @@ void uvg_fwd_lfnst( const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) + if (lfnst_index && !mts_skip) { assert(log2_width != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; @@ -1005,7 +1005,7 @@ void uvg_inv_lfnst( bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { + if (lfnst_index && !mts_skip) { const bool whge3 = width >= 8 && height >= 8; const uint32_t* scan = whge3 ? 
uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; From 657254d38a593b4bb9c9d8dedfd78b767b5e8cb1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 28 Nov 2022 11:24:55 +0200 Subject: [PATCH 126/254] [mtt] search with depth 1 mtt kinda working --- src/cu.c | 5 +- src/encode_coding_tree.c | 8 +- src/intra.c | 18 ++-- src/intra.h | 7 +- src/search.c | 180 ++++++++++++++++++++++----------------- 5 files changed, 122 insertions(+), 96 deletions(-) diff --git a/src/cu.c b/src/cu.c index 10d8aabb..3eb7a771 100644 --- a/src/cu.c +++ b/src/cu.c @@ -399,11 +399,11 @@ int uvg_get_possible_splits(const encoder_state_t * const state, splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; - const enum split_type last_split = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; + const enum split_type last_split = (split_tree.split_tree >> (split_tree.current_depth * 3 - 3)) & 7; const enum split_type parl_split = last_split == BT_HOR_SPLIT ? 
BT_HOR_SPLIT : BT_VER_SPLIT; // don't allow QT-splitting below a BT split - if (split_tree.current_depth != 0 && last_split != QT_SPLIT && (width > 64 || height > 64)) splits[QT_SPLIT] = false; + if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false; if (width <= min_qt_size) splits[QT_SPLIT] = false; if (tree_type == UVG_CHROMA_T && width <= 4) splits[QT_SPLIT] = false; @@ -488,6 +488,7 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons int amount = 0; if(left) { + if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu_loc->height == 32 && cu_loc->width == 32) return 8; while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET && (cu_loc->local_y + amount) < LCU_WIDTH) { amount += TR_MIN_WIDTH; } diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 54a7a639..999da9a3 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1643,7 +1643,7 @@ void uvg_encode_coding_tree( ((is_local_dual_tree && has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { - int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, NULL, frame->cu_array, UVG_CHROMA_T); + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir,NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints cu_info_t* tmp = (cu_info_t*)cur_cu; @@ -1793,9 +1793,9 @@ double uvg_mock_encode_coding_unit( uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } if((chroma_loc || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, - tree_type != UVG_CHROMA_T ? lcu : NULL, - tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, is_separate_tree ? UVG_CHROMA_T : tree_type); + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc,cu_loc , cur_cu, tree_type != UVG_CHROMA_T ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, + is_separate_tree ? UVG_CHROMA_T : tree_type); encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir, &bits); } } diff --git a/src/intra.c b/src/intra.c index bcc04359..778b779d 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1603,22 +1603,26 @@ void uvg_intra_predict( // This function works on luma coordinates int8_t uvg_get_co_located_luma_mode( - int x, - int y, - int width, - int height, + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type) { + int x = chroma_loc->x; + int y = chroma_loc->y; assert((cu_array || lcu) && !(cu_array && lcu)); assert(tree_type != UVG_LUMA_T && "Luma only CU shouldn't need colocated luma CU"); if(tree_type == UVG_CHROMA_T) { - x += width >> 1; - y += height >> 1; + x += chroma_loc->width >> 1; + y += chroma_loc->height >> 1; } const cu_info_t* cu; - if(cu_array) { + if (lcu && cu_loc->x <= x && x < cu_loc->x + cu_loc->width && cu_loc->y <= y && y < cu_loc->y + cu_loc->height) { + cu = luma_cu; + } + else if(cu_array) { cu = 
uvg_cu_array_at_const(cu_array, x, y); } else { diff --git a/src/intra.h b/src/intra.h index 9f2986eb..fc81e645 100644 --- a/src/intra.h +++ b/src/intra.h @@ -152,10 +152,9 @@ void uvg_intra_recon_cu( bool recon_chroma); int8_t uvg_get_co_located_luma_mode( - int x, - int y, - int width, - int height, + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type); diff --git a/src/search.c b/src/search.c index 6a10a83e..bc55a805 100644 --- a/src/search.c +++ b/src/search.c @@ -1107,9 +1107,10 @@ static double search_cu( intra_search.pred_cu.joint_cb_cr = 0; if(tree_type == UVG_CHROMA_T || is_separate_tree) { - intra_mode = uvg_get_co_located_luma_mode(chroma_loc->x, chroma_loc->y, chroma_loc->width, chroma_loc->height, - is_separate_tree ? lcu : NULL, - tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, UVG_CHROMA_T); + intra_mode = uvg_get_co_located_luma_mode( + chroma_loc, cu_loc, &intra_search.pred_cu, is_separate_tree ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, + UVG_CHROMA_T); intra_search.pred_cu.type = CU_INTRA; } else if (intra_search.pred_cu.intra.mip_flag) { intra_mode = 0; @@ -1346,86 +1347,102 @@ static double search_cu( fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } + bool can_split[6]; + uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; + // Recursively split all the way to max search depth. if (can_split_cu) { - const int split_type = depth == 2 ? 
TT_VER_SPLIT : QT_SPLIT; - split_tree_t new_split = { - split_tree.split_tree | split_type << (split_tree.current_depth * 3), - split_tree.current_depth + 1, - split_tree.mtt_depth + (split_type != QT_SPLIT), - 0 - }; - - double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf); + lcu_t * split_lcu = MALLOC(lcu_t, 5); + enum split_type best_split = 0; + double best_split_cost = MAX_DOUBLE; cabac_data_t post_seach_cabac; + cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); - memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { + if (!can_split[split_type] || split_type != QT_SPLIT) continue; + split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + 0 + }; + + double split_cost = 0.0; + int cbf = cbf_is_set_any(cur_cu->cbf); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); - double split_bits = 0; - lcu_t split_lcu; + double split_bits = 0; - if (cur_cu->log2_height + cur_cu->log2_width > 4) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { - state->search_cabac.update = 1; - // Add cost of cu_split_flag. - const cu_info_t* left_cu = NULL, * above_cu = NULL; - if (x) { - if (x_local || tree_type != UVG_CHROMA_T) { - left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + state->search_cabac.update = 1; + // Add cost of cu_split_flag. 
+ const cu_info_t* left_cu = NULL, * above_cu = NULL; + if (x) { + if (x_local || tree_type != UVG_CHROMA_T) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + else { + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + } } - else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); - } - } - if (y) { - if (y_local || tree_type != UVG_CHROMA_T) { - above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); - } - else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); - } - } - uvg_write_split_flag( - state, - &state->search_cabac, - left_cu, - above_cu, - tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, - split_tree, - tree_type, - &split_bits); - } - - state->search_cabac.update = 0; - split_cost += split_bits * state->lambda; - - // If skip mode was selected for the block, skip further search. - // Skip mode means there's no coefficients in the block, so splitting - // might not give any better results but takes more time to do. - // It is ok to interrupt the search as soon as it is known that - // the split costs at least as much as not splitting. - if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - cu_loc_t new_cu_loc[4]; - uint8_t separate_chroma = 0; - const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); - initialize_partial_work_tree(lcu, &split_lcu, cu_loc, tree_type); - for (int split = 0; split < splits; ++split) { - new_split.part_index = split; - split_cost += search_cu(state, - &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], - &split_lcu, - tree_type, new_split, - !separate_chroma || split == splits - 1); - // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma - if (split_cost > cost) { - break; + if (y) { + if (y_local || tree_type != UVG_CHROMA_T) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); + } + else { + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); + } } + split_tree_t count_tree = split_tree; + count_tree.split_tree = split_tree.split_tree | split_type << (split_tree.current_depth * 3); + uvg_write_split_flag( + state, + &state->search_cabac, + left_cu, + above_cu, + tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, + count_tree, + tree_type, + &split_bits); } - } else { - split_cost = INT_MAX; + state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; + + // If skip mode was selected for the block, skip further search. + // Skip mode means there's no coefficients in the block, so splitting + // might not give any better results but takes more time to do. + // It is ok to interrupt the search as soon as it is known that + // the split costs at least as much as not splitting. + if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, tree_type); + for (int split = 0; split < splits; ++split) { + new_split.part_index = split; + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], + &split_lcu[split_type -1], + tree_type, new_split, + !separate_chroma || split == splits - 1); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + if (split_cost > cost || split_cost > best_split_cost) { + break; + } + } + + } else { + split_cost = INT_MAX; + } + if (split_cost < best_split_cost) { + best_split_cost = split_cost; + best_split = split_type; + memcpy(&best_split_cabac, &state->search_cabac, sizeof(cabac_data_t)); + } } // If no search is not performed for this depth, try just the best mode @@ -1440,7 +1457,7 @@ static double search_cu( && tree_type == UVG_BOTH_T) { - cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&split_lcu, x_local, y_local); + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&split_lcu[best_split - 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && (cu_d1->log2_height + 1 == cur_cu->log2_height || cu_d1->log2_width + 1 == cur_cu->log2_width)) { @@ -1488,10 +1505,14 @@ static double search_cu( } } - if (split_cost < cost) { + if (best_split_cost < cost) { // Copy split modes to this depth. - cost = split_cost; - work_tree_copy_up(&split_lcu, lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc, is_separate_tree && !has_chroma ? NULL : chroma_loc); + cost = best_split_cost; + memcpy(&state->search_cabac, &best_split_cabac, sizeof(best_split_cabac)); + work_tree_copy_up(&split_lcu[best_split -1], lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc, is_separate_tree && !has_chroma ? 
NULL : chroma_loc); + downsample_cclm_rec( + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] + ); #if UVG_DEBUG //debug_split = 1; #endif @@ -1522,6 +1543,7 @@ static double search_cu( state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } + FREE_POINTER(split_lcu); } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. @@ -1745,11 +1767,11 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con double cost = search_cu( state, &start, - NULL, + &start, &work_tree, tree_type, split_tree, - false); + true); // Save squared cost for rate control. if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1766,10 +1788,10 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( state, &start, - NULL, + &start, &work_tree, UVG_CHROMA_T, split_tree, - false); + true); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; From 8fbefc0de311df7fc2d2fdfe15114b8115295eb2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 29 Nov 2022 07:47:05 +0200 Subject: [PATCH 127/254] [mtt] fix cost calculation --- src/search.c | 24 +- src/search_inter.c | 6 +- src/search_intra.c | 8 +- src/strategies/avx2/picture-avx2.c | 3 +- src/strategies/generic/picture-generic.c | 577 ++++++++++++++++++++++- src/strategies/strategies-picture.c | 1 + src/strategies/strategies-picture.h | 4 +- src/transform.c | 6 +- 8 files changed, 603 insertions(+), 26 deletions(-) diff --git a/src/search.c b/src/search.c index bc55a805..3c76dc93 100644 --- a/src/search.c +++ b/src/search.c @@ -298,16 +298,16 @@ static double cu_zero_coeff_cost( double ssd = 0.0; ssd += UVG_LUMA_MULT * 
uvg_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], - LCU_WIDTH, LCU_WIDTH, cu_loc->width + LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height ); if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); } // Save the pixels at a lower level of the working tree. @@ -445,7 +445,7 @@ double uvg_cu_rd_cost_luma( int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - cu_loc->width); + cu_loc->width, cu_loc->height); } @@ -550,10 +550,10 @@ double uvg_cu_rd_cost_chroma( int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - cu_loc->chroma_width); + cu_loc->chroma_width, cu_loc->chroma_height); int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - cu_loc->chroma_width); + cu_loc->chroma_width, cu_loc->chroma_height); ssd = ssd_u + ssd_v; } @@ -684,7 +684,7 @@ static double cu_rd_cost_tr_split_accurate( int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y; luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + width, height); } // Chroma transform skip enable/disable is non-normative, so we need to count the chroma // tr-skip bits even when we are never using it. 
@@ -762,10 +762,10 @@ static double cu_rd_cost_tr_split_accurate( int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + chroma_width, chroma_height); unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + chroma_width, chroma_height); chroma_ssd = ssd_u + ssd_v; } if(chroma_can_use_tr_skip && cb_flag_u) { @@ -783,10 +783,10 @@ static double cu_rd_cost_tr_split_accurate( int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + chroma_width, chroma_height); int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + chroma_width, chroma_height); chroma_ssd = ssd_u_joint + ssd_v_joint; } if (chroma_can_use_tr_skip) { @@ -1360,7 +1360,7 @@ static double search_cu( cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { - if (!can_split[split_type] || split_type != QT_SPLIT) continue; + if (!can_split[split_type] || (split_type != QT_SPLIT && depth == 0) || (split_type == QT_SPLIT && depth == 1)) continue; split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, diff --git a/src/search_inter.c b/src/search_inter.c index 92a62795..76c7fc36 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2144,15 +2144,15 @@ void uvg_cu_cost_inter_rd2( int index = y_px * LCU_WIDTH + x_px; double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width) * UVG_LUMA_MULT; + width, height) * UVG_LUMA_MULT; if (reconstruct_chroma) { int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = 
uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - cu_loc->chroma_width); + cu_loc->chroma_width, cu_loc->chroma_height); double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - cu_loc->chroma_width); + cu_loc->chroma_width, cu_loc->chroma_height); ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; } double no_cbf_bits; diff --git a/src/search_intra.c b/src/search_intra.c index 1ed00943..2a406076 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -145,15 +145,15 @@ static void get_cost_dual( if (satd_twin_func != NULL) { satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); } else { - satd_costs[0] = uvg_satd_any_size(width, height, preds[0], width, orig_block, LCU_WIDTH); - satd_costs[1] = uvg_satd_any_size(width, height, preds[1], width, orig_block, LCU_WIDTH); + satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width); + satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width); } unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 }; if (sad_twin_func != NULL) { sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); } else { - unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, LCU_WIDTH); - unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, LCU_WIDTH); + unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width); + unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width); } costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2); costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2); diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index a911928d..5d0b203c 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4) static unsigned 
pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { + assert(width == height && "Non square not yet implemented"); __m256i ssd_part; __m256i diff = _mm256_setzero_si256(); __m128i sum; diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 6797a669..d6e3c81c 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -32,6 +32,7 @@ #include "strategies/generic/picture-generic.h" +#include #include #include "strategies/strategies-picture.h" @@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel) SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) +uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + uint64_t satd = 0; + coeff_t diff[4], m[4]; + + diff[0] = piOrg[0] - piCur[0]; + diff[1] = piOrg[1] - piCur[1]; + diff[2] = piOrg[iStrideOrg] - piCur[0 + iStrideCur]; + diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur]; + m[0] = diff[0] + diff[2]; + m[1] = diff[1] + diff[3]; + m[2] = diff[0] - diff[2]; + m[3] = diff[1] - diff[3]; + + satd += abs(m[0] + m[1]) >> 2; + satd += abs(m[0] - m[1]); + satd += abs(m[2] + m[3]); + satd += abs(m[2] - m[3]); + + return satd; +} + + +static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ //need to add SIMD implementation ,JCA + int k, i, j, jj, sad = 0; + int diff[128], m1[8][16], m2[8][16]; + for (k = 0; k < 128; k += 16) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + diff[k + 8] = piOrg[8] - piCur[8]; + diff[k + 9] = piOrg[9] - piCur[9]; + diff[k + 10] = piOrg[10] 
- piCur[10]; + diff[k + 11] = piOrg[11] - piCur[11]; + diff[k + 12] = piOrg[12] - piCur[12]; + diff[k + 13] = piOrg[13] - piCur[13]; + diff[k + 14] = piOrg[14] - piCur[14]; + diff[k + 15] = piOrg[15] - piCur[15]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 4; + + m2[j][0] = diff[jj] + diff[jj + 8]; + m2[j][1] = diff[jj + 1] + diff[jj + 9]; + m2[j][2] = diff[jj + 2] + diff[jj + 10]; + m2[j][3] = diff[jj + 3] + diff[jj + 11]; + m2[j][4] = diff[jj + 4] + diff[jj + 12]; + m2[j][5] = diff[jj + 5] + diff[jj + 13]; + m2[j][6] = diff[jj + 6] + diff[jj + 14]; + m2[j][7] = diff[jj + 7] + diff[jj + 15]; + m2[j][8] = diff[jj] - diff[jj + 8]; + m2[j][9] = diff[jj + 1] - diff[jj + 9]; + m2[j][10] = diff[jj + 2] - diff[jj + 10]; + m2[j][11] = diff[jj + 3] - diff[jj + 11]; + m2[j][12] = diff[jj + 4] - diff[jj + 12]; + m2[j][13] = diff[jj + 5] - diff[jj + 13]; + m2[j][14] = diff[jj + 6] - diff[jj + 14]; + m2[j][15] = diff[jj + 7] - diff[jj + 15]; + + m1[j][0] = m2[j][0] + m2[j][4]; + m1[j][1] = m2[j][1] + m2[j][5]; + m1[j][2] = m2[j][2] + m2[j][6]; + m1[j][3] = m2[j][3] + m2[j][7]; + m1[j][4] = m2[j][0] - m2[j][4]; + m1[j][5] = m2[j][1] - m2[j][5]; + m1[j][6] = m2[j][2] - m2[j][6]; + m1[j][7] = m2[j][3] - m2[j][7]; + m1[j][8] = m2[j][8] + m2[j][12]; + m1[j][9] = m2[j][9] + m2[j][13]; + m1[j][10] = m2[j][10] + m2[j][14]; + m1[j][11] = m2[j][11] + m2[j][15]; + m1[j][12] = m2[j][8] - m2[j][12]; + m1[j][13] = m2[j][9] - m2[j][13]; + m1[j][14] = m2[j][10] - m2[j][14]; + m1[j][15] = m2[j][11] - m2[j][15]; + + m2[j][0] = m1[j][0] + m1[j][2]; + m2[j][1] = m1[j][1] + m1[j][3]; + m2[j][2] = m1[j][0] - m1[j][2]; + m2[j][3] = m1[j][1] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][6]; + m2[j][5] = m1[j][5] + m1[j][7]; + m2[j][6] = m1[j][4] - m1[j][6]; + m2[j][7] = m1[j][5] - m1[j][7]; + m2[j][8] = m1[j][8] + m1[j][10]; + m2[j][9] = m1[j][9] + m1[j][11]; + m2[j][10] = m1[j][8] - m1[j][10]; + m2[j][11] = m1[j][9] - m1[j][11]; + 
m2[j][12] = m1[j][12] + m1[j][14]; + m2[j][13] = m1[j][13] + m1[j][15]; + m2[j][14] = m1[j][12] - m1[j][14]; + m2[j][15] = m1[j][13] - m1[j][15]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][5]; + m1[j][5] = m2[j][4] - m2[j][5]; + m1[j][6] = m2[j][6] + m2[j][7]; + m1[j][7] = m2[j][6] - m2[j][7]; + m1[j][8] = m2[j][8] + m2[j][9]; + m1[j][9] = m2[j][8] - m2[j][9]; + m1[j][10] = m2[j][10] + m2[j][11]; + m1[j][11] = m2[j][10] - m2[j][11]; + m1[j][12] = m2[j][12] + m2[j][13]; + m1[j][13] = m2[j][12] - m2[j][13]; + m1[j][14] = m2[j][14] + m2[j][15]; + m1[j][15] = m2[j][14] - m2[j][15]; + } + + //vertical + for (i = 0; i < 16; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 16; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[128], m1[16][8], m2[16][8]; + for (k = 0; k < 128; 
k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 16; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[8][i]; + m1[1][i] = m2[1][i] + m2[9][i]; + m1[2][i] = m2[2][i] + m2[10][i]; + m1[3][i] = m2[3][i] + m2[11][i]; + m1[4][i] = m2[4][i] + m2[12][i]; + m1[5][i] = m2[5][i] + m2[13][i]; + m1[6][i] = m2[6][i] + m2[14][i]; + m1[7][i] = m2[7][i] + m2[15][i]; + m1[8][i] = m2[0][i] - m2[8][i]; + m1[9][i] = m2[1][i] - m2[9][i]; + m1[10][i] = m2[2][i] - m2[10][i]; + m1[11][i] = m2[3][i] - m2[11][i]; + m1[12][i] = m2[4][i] - m2[12][i]; + m1[13][i] = m2[5][i] - m2[13][i]; + m1[14][i] = m2[6][i] - m2[14][i]; + m1[15][i] = m2[7][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = 
m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[12][i]; + m2[9][i] = m1[9][i] + m1[13][i]; + m2[10][i] = m1[10][i] + m1[14][i]; + m2[11][i] = m1[11][i] + m1[15][i]; + m2[12][i] = m1[8][i] - m1[12][i]; + m2[13][i] = m1[9][i] - m1[13][i]; + m2[14][i] = m1[10][i] - m1[14][i]; + m2[15][i] = m1[11][i] - m1[15][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + m1[8][i] = m2[8][i] + m2[10][i]; + m1[9][i] = m2[9][i] + m2[11][i]; + m1[10][i] = m2[8][i] - m2[10][i]; + m1[11][i] = m2[9][i] - m2[11][i]; + m1[12][i] = m2[12][i] + m2[14][i]; + m1[13][i] = m2[13][i] + m2[15][i]; + m1[14][i] = m2[12][i] - m2[14][i]; + m1[15][i] = m2[13][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[9][i]; + m2[9][i] = m1[8][i] - m1[9][i]; + m2[10][i] = m1[10][i] + m1[11][i]; + m2[11][i] = m1[10][i] - m1[11][i]; + m2[12][i] = m1[12][i] + m1[13][i]; + m2[13][i] = m1[12][i] - m1[13][i]; + m2[14][i] = m1[14][i] + m1[15][i]; + m2[15][i] = m1[14][i] - m1[15][i]; + } + + for (i = 0; i < 16; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[8][4], 
m2[8][4]; + for (k = 0; k < 32; k += 4) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 2; + m2[j][0] = diff[jj] + diff[jj + 2]; + m2[j][1] = diff[jj + 1] + diff[jj + 3]; + m2[j][2] = diff[jj] - diff[jj + 2]; + m2[j][3] = diff[jj + 1] - diff[jj + 3]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + } + + //vertical + for (i = 0; i < 4; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 4; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[4][8], m2[4][8]; + for (k = 0; k < 32; k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - 
piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 4; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + } + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + + +uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride) +{ + const uvg_pixel* piOrg = ref_in; + const uvg_pixel* piCur = pred_in; + const int iRows = height; + const int iCols = width; + const int 
iStrideOrg = ref_stride; + const int iStrideCur = pred_stride; + + int x = 0, y = 0; + + uint64_t uiSum = 0; + + if (iCols > iRows && (iRows & 7) == 0 && (iCols & 15) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 16) + { + uiSum += xCalcHADs16x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if (iCols < iRows && (iCols & 7) == 0 && (iRows & 15) == 0) + { + for (y = 0; y < iRows; y += 16) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 16; + piCur += iStrideCur * 16; + } + } + else if (iCols > iRows && (iRows & 3) == 0 && (iCols & 7) == 0) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x4(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 4; + piCur += iStrideCur * 4; + } + } + else if (iCols < iRows && (iCols & 3) == 0 && (iRows & 7) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += xCalcHADs4x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if ((iRows % 8 == 0) && (iCols % 8 == 0)) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += satd_8x8_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 8 * iStrideOrg; + piCur += 8 * iStrideCur; + } + } + else if ((iRows % 4 == 0) && (iCols % 4 == 0)) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += uvg_satd_4x4_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 4 * iStrideOrg; + piCur += 4 * iStrideCur; + } + } + else if ((iRows % 2 == 0) && (iCols % 2 == 0)) + { + for (y = 0; y < iRows; y += 2) + { + for (x = 0; x < iCols; x += 2) + { + uiSum += xCalcHADs2x2(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + 
piOrg += 2 * iStrideOrg; + piCur += 2 * iStrideCur; + } + } + + // TODO: 10 bit + return (uiSum >> 0); +} + + // Function macro for defining SAD calculating functions // for fixed size blocks. #define SAD_NXN(n, pixel_type) \ @@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel) static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { int ssd = 0; int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride]; ssd += diff * diff; @@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic); + success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs); success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 37d3cb75..643d2f8f 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -70,6 +70,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0; cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0; cost_pixel_any_size_func * uvg_satd_any_size = 0; +cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0; cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0; pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0; diff --git 
a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 286a0735..cd4e2ec5 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)( typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out); typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); -typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height); typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data, int32_t block_width, int32_t block_height, @@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16; extern cost_pixel_nxn_func * uvg_satd_32x32; extern cost_pixel_nxn_func * uvg_satd_64x64; extern cost_pixel_any_size_func *uvg_satd_any_size; +extern cost_pixel_any_size_func *uvg_satd_any_size_vtm; extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual; extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual; @@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigne {"satd_32x32", (void**) &uvg_satd_32x32}, \ {"satd_64x64", (void**) &uvg_satd_64x64}, \ {"satd_any_size", (void**) &uvg_satd_any_size}, \ + {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \ {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \ {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \ {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \ diff --git a/src/transform.c b/src/transform.c 
index 26851b8d..7e2b64ee 100644 --- a/src/transform.c +++ b/src/transform.c @@ -617,7 +617,7 @@ void uvg_chroma_transform_search( if (v_has_coeffs && !is_jccr) { - uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, + uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); if (transforms[i] != CHROMA_TS) { @@ -661,10 +661,10 @@ void uvg_chroma_transform_search( if (!state->encoder_control->cfg.lossless) { ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); } double u_bits = 0; From d5d9afb1e240d824acfe6b170348c953dd2c7b74 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 29 Nov 2022 12:14:58 +0200 Subject: [PATCH 128/254] [mtt] fix dual tree --- src/cfg.c | 2 +- src/cu.c | 28 +++++++++++++++++++++++----- src/cu.h | 1 + src/encode_coding_tree.c | 28 ++++++++++++++++++++++------ src/encoderstate.c | 13 ++++++++++--- src/intra.c | 40 +++++++++++++++++++++++++++++++--------- 6 files changed, 88 insertions(+), 24 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 39643e9f..5ff05859 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -229,7 +229,7 @@ int uvg_config_init(uvg_config *cfg) cfg->max_btt_depth[0] = 1; cfg->max_btt_depth[1] = 0; - cfg->max_btt_depth[2] = 0; + cfg->max_btt_depth[2] = 1; cfg->max_tt_size[0] = 64; cfg->max_bt_size[0] = 64; diff --git a/src/cu.c b/src/cu.c index 3eb7a771..fcbdc15d 100644 --- a/src/cu.c +++ b/src/cu.c @@ -407,11 +407,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, if (width <= min_qt_size) splits[QT_SPLIT] = false; if (tree_type == UVG_CHROMA_T && width <= 4) splits[QT_SPLIT] = false; - if (tree_type == UVG_CHROMA_T) - { - splits[QT_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = 
splits[TT_VER_SPLIT] = false; - return; - } + if (implicitSplit != NO_SPLIT) { splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false; @@ -498,4 +494,26 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons amount += TR_MIN_WIDTH; } return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); +} + +int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height, const lcu_t* const lcu, bool left) +{ + if (left && x == 0 || !left && y == 0) return 0; + const int local_x = x % LCU_WIDTH_C; + const int local_y = y % LCU_WIDTH_C; + if (left && local_x == 0) return (LCU_WIDTH_C - local_y) / 4; + if (!left && local_y == 0) return width / 2; + + int amount = 0; + if(left) { + while (LCU_GET_CU_AT_PX(lcu, local_x - TR_MIN_WIDTH, local_y + amount)->type != CU_NOTSET && (local_y + amount) < LCU_WIDTH_C) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, height / TR_MIN_WIDTH); + } + while (LCU_GET_CU_AT_PX(lcu, local_x + amount, local_y - TR_MIN_WIDTH)->type != CU_NOTSET && local_x + amount < LCU_WIDTH_C) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, width / TR_MIN_WIDTH); + } \ No newline at end of file diff --git a/src/cu.h b/src/cu.h index 5855eaed..bfbb50a1 100644 --- a/src/cu.h +++ b/src/cu.h @@ -377,6 +377,7 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu tree_type); int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); +int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height, const lcu_t* const lcu, bool left); /** * \brief Return pointer to the top right reference CU. 
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 999da9a3..12f455f2 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1352,13 +1352,14 @@ void uvg_encode_coding_tree( const encoder_control_t * const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; - const int x = cu_loc->x; - const int y = cu_loc->y; + const int x = tree_type != UVG_CHROMA_T ? cu_loc->x : chroma_loc->x; + const int y = tree_type != UVG_CHROMA_T ? cu_loc->y : chroma_loc->y; + + const cu_info_t* cur_cu = uvg_cu_array_at_const(used_array, x, y); const int depth = split_tree.current_depth; @@ -1397,7 +1398,7 @@ void uvg_encode_coding_tree( cabac, left_cu, above_cu, - cu_loc, + tree_type != UVG_CHROMA_T ? cu_loc : chroma_loc, split_tree, tree_type, NULL); @@ -1406,12 +1407,24 @@ void uvg_encode_coding_tree( split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT), 0}; cu_loc_t new_cu_loc[4]; + cu_loc_t chroma_tree_loc; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); for (int split = 0; split >= 1; + chroma_tree_loc.y >>= 1; + chroma_tree_loc.local_x = chroma_tree_loc.x & LCU_WIDTH_C; + chroma_tree_loc.local_y = chroma_tree_loc.y & LCU_WIDTH_C; + chroma_tree_loc.width >>= 1; + chroma_tree_loc.height >>= 1; + assert(!separate_chroma); + } uvg_encode_coding_tree(state, coeff, tree_type, - &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], + &new_cu_loc[split], + separate_chroma ? 
chroma_loc :(tree_type == UVG_CHROMA_T ? &chroma_tree_loc : &new_cu_loc[split]), new_split_tree, !separate_chroma || split == splits - 1); } return; @@ -1420,6 +1433,9 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); + if(tree_type==UVG_CHROMA_T) + fprintf(stderr, "%d %d %d %d\n", x * 2, y * 2, cu_width * 2, cu_height*2); + if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag"); @@ -1643,7 +1659,7 @@ void uvg_encode_coding_tree( ((is_local_dual_tree && has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { - int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); + int8_t luma_dir = uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir,NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints cu_info_t* tmp = (cu_info_t*)cur_cu; diff --git a/src/encoderstate.c b/src/encoderstate.c index 5931f8d0..6cd9bbca 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -883,13 +883,20 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) //Encode coding tree cu_loc_t start; uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0, 0, 0 }; + split_tree_t split_tree = { 0, 0, 0 }; uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, LCU_WIDTH, LCU_WIDTH); - uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &start, split_tree, true); + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + cu_loc_t chroma_tree_loc = start; + chroma_tree_loc.x >>= 1; + chroma_tree_loc.y >>= 1; + chroma_tree_loc.local_x = chroma_tree_loc.x & LCU_WIDTH_C; + chroma_tree_loc.local_y = chroma_tree_loc.y & LCU_WIDTH_C; + chroma_tree_loc.width >>= 1; + chroma_tree_loc.height >>= 1; + uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true); } if (!state->cabac.only_count) { diff --git a/src/intra.c b/src/intra.c index 778b779d..1dd74cb3 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1119,9 +1119,14 @@ void uvg_intra_build_reference_any( } } else { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); - px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus *2; - px_available_left -= px.x % 4; + if (!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? 
num_cus * 4 : num_cus * 2; + } + else { + const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, true); + px_available_left = num_cus * 4; + } } // Limit the number of available pixels based on block size and dimensions @@ -1242,8 +1247,14 @@ void uvg_intra_build_reference_any( } } else { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); - px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + if (!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + } + else { + const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, false); + px_available_top = num_cus * 4; + } } // Limit the number of available pixels based on block size and dimensions @@ -1428,8 +1439,13 @@ void uvg_intra_build_reference_inner( } else { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); - px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + if(!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + } else { + const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, true); + px_available_left = num_cus * 4; + } } // Limit the number of available pixels based on block size and dimensions @@ -1490,8 +1506,14 @@ void uvg_intra_build_reference_inner( } } else { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); - px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + if (!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = is_dual_tree || !is_chroma ? 
num_cus * 4 : num_cus * 2; + } + else { + const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, false); + px_available_top = num_cus * 4; + } } // Limit the number of available pixels based on block size and dimensions From fb146cb6edd3307726d7400a39410ba663b45f80 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 29 Nov 2022 12:51:34 +0200 Subject: [PATCH 129/254] [mtt] proper split availability checking for split flag --- src/cu.c | 2 +- src/encode_coding_tree.c | 77 +++++++++------------------------------- 2 files changed, 18 insertions(+), 61 deletions(-) diff --git a/src/cu.c b/src/cu.c index fcbdc15d..1e349bc7 100644 --- a/src/cu.c +++ b/src/cu.c @@ -440,7 +440,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, { splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false; - return; + return 0; } if (width > max_bt_size || height > max_bt_size) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 12f455f2..ed2e8468 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1206,66 +1206,26 @@ uint8_t uvg_write_split_flag( enum uvg_tree_type tree_type, double* bits_out) { - uint16_t abs_x = cu_loc->x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = cu_loc->y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); double bits = 0; - const encoder_control_t* const ctrl = state->encoder_control; // Implisit split flag when on border // Exception made in VVC with flag not being implicit if the BT can be used for // horizontal or vertical split, then this flag tells if QT or BT is used - const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; - - bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; - no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - const int cu_width = tree_type != UVG_CHROMA_T ? 
cu_loc->width : cu_loc->chroma_width; const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; - if (cu_width == state->encoder_control->cfg.min_qt_size[slice_type] || split_tree.mtt_depth > 0) allow_qt = false; - bool allow_btt = state->encoder_control->cfg.max_btt_depth[slice_type] > split_tree.mtt_depth && cu_width <= 64; - uint8_t implicit_split_mode = UVG_NO_SPLIT; - //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_height - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T))); - bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T))); - if (!bottom_left_available && !top_right_available && allow_qt) { - implicit_split_mode = QT_SPLIT; - } - else if (!bottom_left_available && allow_btt) { - implicit_split_mode = BT_HOR_SPLIT; - } - else if (!top_right_available && allow_btt) { - implicit_split_mode = BT_VER_SPLIT; - } - else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = QT_SPLIT; - } - - // Check split conditions - if (implicit_split_mode != UVG_NO_SPLIT) { - no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == BT_HOR_SPLIT); - bv_split = (implicit_split_mode == BT_VER_SPLIT); - } + bool can_split[6]; + const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); - if (!allow_btt) { - bh_split = bv_split = th_split = tv_split = false; - } - else { - bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type] && cu_width > state->encoder_control->cfg.min_qt_size[slice_type]; - tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type] && cu_width > 2 * state->encoder_control->cfg.min_qt_size[slice_type]; - bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type] && cu_height > state->encoder_control->cfg.min_qt_size[slice_type]; - th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type] && 
cu_height > 2 * state->encoder_control->cfg.min_qt_size[slice_type]; - } - bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; + bool allow_split = can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; enum split_type split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; - split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag; + split_flag = is_implicit ? (can_split[QT_SPLIT] ? QT_SPLIT : (can_split[BT_HOR_SPLIT] ? BT_HOR_SPLIT : BT_VER_SPLIT)) : split_flag; int split_model = 0; - if (no_split && allow_split) { + if (can_split[NO_SPLIT] && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number if (left_cu && (1 << left_cu->log2_height) < cu_height) { split_model++; @@ -1276,11 +1236,11 @@ uint8_t uvg_write_split_flag( } uint32_t split_num = 0; - if (allow_qt) split_num += 2; - if (bh_split) split_num++; - if (bv_split) split_num++; - if (th_split) split_num++; - if (tv_split) split_num++; + if (can_split[QT_SPLIT]) split_num += 2; + if (can_split[BT_HOR_SPLIT]) split_num++; + if (can_split[BT_VER_SPLIT]) split_num++; + if (can_split[TT_HOR_SPLIT]) split_num++; + if (can_split[TT_VER_SPLIT]) split_num++; if (split_num > 0) split_num--; @@ -1292,9 +1252,9 @@ uint8_t uvg_write_split_flag( } - if (implicit_split_mode == UVG_NO_SPLIT && allow_qt && (bh_split || bv_split || th_split || tv_split) && split_flag != NO_SPLIT) { + if (!is_implicit && can_split[QT_SPLIT] && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) && split_flag != NO_SPLIT) { bool qt_split = split_flag == QT_SPLIT; - if((bv_split || bh_split || tv_split || th_split) && allow_qt) { + if((can_split[BT_VER_SPLIT] || can_split[BT_HOR_SPLIT] || can_split[TT_VER_SPLIT] || can_split[TT_HOR_SPLIT]) && can_split[QT_SPLIT]) { unsigned left_qt_depth = 0; unsigned top_qt_depth = 0; if(left_cu) { @@ -1312,11 
+1272,11 @@ uint8_t uvg_write_split_flag( } if (!qt_split) { const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT; - if((bh_split || th_split) && (bv_split || tv_split)) { + if((can_split[BT_HOR_SPLIT] || can_split[TT_HOR_SPLIT]) && (can_split[BT_VER_SPLIT] || can_split[TT_VER_SPLIT])) { split_model = 0; - if(bv_split + tv_split > bh_split + th_split) { + if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] > can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { split_model = 4; - } else if(bv_split + tv_split < bh_split + th_split) { + } else if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] < can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { split_model = 3; } else { const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1); @@ -1327,7 +1287,7 @@ uint8_t uvg_write_split_flag( } CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag"); } - if ((bv_split && tv_split && is_vertical) || (bh_split && th_split && !is_vertical)) { + if ((can_split[BT_VER_SPLIT] && can_split[TT_VER_SPLIT] && is_vertical) || (can_split[BT_HOR_SPLIT] && can_split[TT_HOR_SPLIT] && !is_vertical)) { split_model = (2 * is_vertical) + (split_tree.mtt_depth <= 1); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag"); @@ -1432,10 +1392,7 @@ void uvg_encode_coding_tree( } DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - - if(tree_type==UVG_CHROMA_T) - fprintf(stderr, "%d %d %d %d\n", x * 2, y * 2, cu_width * 2, cu_height*2); - + if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag"); From 9e644fafd03c073456b24f6ec12376fcb6648ece Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 30 Nov 2022 10:43:12 +0200 Subject: [PATCH 130/254] [mtt] search with mtt depth 2 and 
dual tree works without lfnst --- src/cfg.c | 2 +- src/cu.c | 6 +++--- src/cu.h | 2 +- src/encode_coding_tree.c | 20 +++++++++++++------- src/intra.c | 22 +++++++++++++++++++++- src/intra.h | 2 ++ src/search.c | 8 ++++---- src/search_intra.c | 4 ++-- 8 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 5ff05859..1bd6587d 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -227,7 +227,7 @@ int uvg_config_init(uvg_config *cfg) cfg->min_qt_size[1] = 4; cfg->min_qt_size[2] = 4; - cfg->max_btt_depth[0] = 1; + cfg->max_btt_depth[0] = 2; cfg->max_btt_depth[1] = 0; cfg->max_btt_depth[2] = 1; diff --git a/src/cu.c b/src/cu.c index 1e349bc7..0c8dd9f7 100644 --- a/src/cu.c +++ b/src/cu.c @@ -400,7 +400,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, bool can_btt = split_tree.mtt_depth < max_btd; const enum split_type last_split = (split_tree.split_tree >> (split_tree.current_depth * 3 - 3)) & 7; - const enum split_type parl_split = last_split == BT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT; + const enum split_type parl_split = last_split == TT_HOR_SPLIT ? 
BT_HOR_SPLIT : BT_VER_SPLIT; // don't allow QT-splitting below a BT split if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false; @@ -459,12 +459,12 @@ int uvg_get_possible_splits(const encoder_state_t * const state, //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; - if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) + if (cu_loc->chroma_height <= min_tt_size || height > max_tt_size || width > max_tt_size) splits[TT_HOR_SPLIT] = false; if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; if (tree_type == UVG_CHROMA_T && width * height <= 16 * 2) splits[TT_HOR_SPLIT] = false; - if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) + if (cu_loc->chroma_width <= min_tt_size || width > max_tt_size || height > max_tt_size) splits[TT_VER_SPLIT] = false; if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; if (tree_type == UVG_CHROMA_T && (width * height <= 16 * 2 || width == 8)) splits[TT_VER_SPLIT] = false; diff --git a/src/cu.h b/src/cu.h index bfbb50a1..7722852d 100644 --- a/src/cu.h +++ b/src/cu.h @@ -607,7 +607,7 @@ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) *cbf |= src & (1 << plane); } -#define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> (curDepth)) & 7) +#define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> ((curDepth) * 3)) & 7) #define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE) #endif diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ed2e8468..b73a446c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -563,6 +563,7 @@ static void encode_transform_unit( (cu_info_t * )cur_pu, NULL); } + if (tree_type == UVG_LUMA_T) return; } bool joint_chroma = cur_pu->joint_cb_cr != 0; @@ -627,8 +628,9 @@ static void encode_transform_coeff( cur_tu = 
uvg_cu_array_at_const(used_array, x, y); } - const bool ver_split = cu_loc->height > TR_MAX_WIDTH; - const bool hor_split = cu_loc->width > TR_MAX_WIDTH; + const int tr_limit = (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)); + const bool ver_split = cu_loc->height > tr_limit; + const bool hor_split = cu_loc->width > tr_limit; const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_tu->cbf, COLOR_Y) : 0; const int cb_flag_u = tree_type != UVG_LUMA_T ?(cur_tu->joint_cb_cr ? (cur_tu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_tu->cbf, COLOR_U)) : 0; @@ -637,10 +639,10 @@ static void encode_transform_coeff( if (hor_split || ver_split) { enum split_type split; - if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + if (cu_loc->width > tr_limit && cu_loc->height > tr_limit) { split = QT_SPLIT; } - else if (cu_loc->width > TR_MAX_WIDTH) { + else if (cu_loc->width > tr_limit) { split = BT_VER_SPLIT; } else { @@ -650,6 +652,10 @@ static void encode_transform_coeff( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { + if(tree_type == UVG_CHROMA_T) { + split_cu_loc[i].chroma_width = split_cu_loc[i].width; + split_cu_loc[i].chroma_height = split_cu_loc[i].height; + } encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], &split_cu_loc[i]); } @@ -1252,7 +1258,7 @@ uint8_t uvg_write_split_flag( } - if (!is_implicit && can_split[QT_SPLIT] && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) && split_flag != NO_SPLIT) { + if (!is_implicit && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) && split_flag != NO_SPLIT) { bool qt_split = split_flag == QT_SPLIT; if((can_split[BT_VER_SPLIT] || can_split[BT_HOR_SPLIT] || can_split[TT_VER_SPLIT] || can_split[TT_HOR_SPLIT]) && 
can_split[QT_SPLIT]) { unsigned left_qt_depth = 0; @@ -1617,7 +1623,7 @@ void uvg_encode_coding_tree( has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { int8_t luma_dir = uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir,NULL); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir,NULL); // LFNST constraints must be reset here. Otherwise the left over values will interfere when calculating new constraints cu_info_t* tmp = (cu_info_t*)cur_cu; tmp->violates_lfnst_constrained_luma = false; @@ -1769,7 +1775,7 @@ double uvg_mock_encode_coding_unit( int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc,cu_loc , cur_cu, tree_type != UVG_CHROMA_T ? lcu : NULL, tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, is_separate_tree ? 
UVG_CHROMA_T : tree_type); - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, luma_dir, &bits); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir, &bits); } } else { diff --git a/src/intra.c b/src/intra.c index 1dd74cb3..09c66c13 100644 --- a/src/intra.c +++ b/src/intra.c @@ -283,6 +283,26 @@ static void intra_pred_dc( } +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t * const luma_loc, cu_info_t const * const cur_cu, enum + uvg_tree_type tree_type) +{ + if (tree_type != UVG_CHROMA_T) { + return true; + } + uint32_t chroma_split = GET_SPLITDATA(cur_cu, 0); + if((chroma_split == BT_VER_SPLIT || chroma_split == TT_VER_SPLIT || chroma_split == TT_HOR_SPLIT) && GET_SPLITDATA(cur_cu, 1) == NO_SPLIT) return false; + const cu_info_t* const luma_cu = uvg_cu_array_at_const(state->tile->frame->cu_array, luma_loc->x, luma_loc->y); + uint32_t split = GET_SPLITDATA(luma_cu, 0); + if (split != QT_SPLIT && split != NO_SPLIT) { + return false; + } + if (split != NO_SPLIT && luma_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + return false; + } + return true; +} + + enum lm_mode { LM_CHROMA_IDX = 81, @@ -1846,7 +1866,7 @@ void uvg_intra_recon_cu( const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const vector2d_t lcu_px = { cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T) }; const int8_t width = cu_loc->width; - const int8_t height = cu_loc->height; // TODO: height for non-square blocks. 
+ const int8_t height = cu_loc->height; if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } diff --git a/src/intra.h b/src/intra.h index fc81e645..5d7d84e7 100644 --- a/src/intra.h +++ b/src/intra.h @@ -158,6 +158,8 @@ int8_t uvg_get_co_located_luma_mode( const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type); +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t* const luma_loc, cu_info_t const* const cur_cu, enum + uvg_tree_type tree_type); uint8_t uvg_get_mip_flag_context( const cu_loc_t* const cu_loc, diff --git a/src/search.c b/src/search.c index 3c76dc93..9546cc0a 100644 --- a/src/search.c +++ b/src/search.c @@ -215,7 +215,7 @@ static void work_tree_copy_up( copy_cu_info (from, to, cu_loc, tree_type); copy_cu_pixels(from, to, cu_loc, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); copy_cu_coeffs(cu_loc, from, to, joint, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); - if (cu_loc != chroma_loc && tree_type == UVG_LUMA_T) { + if (cu_loc != chroma_loc && tree_type != UVG_LUMA_T) { copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); } @@ -1189,7 +1189,7 @@ static double search_cu( bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T || cu_loc->chroma_height % 4 == 2) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); @@ -1200,7 +1200,7 @@ static double search_cu( recon_luma, recon_chroma); - if((is_separate_tree && has_chroma && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400 ) + if((!recon_chroma && state->encoder_control->chroma_format 
!= UVG_CSP_400 ) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, @@ -1360,7 +1360,7 @@ static double search_cu( cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { - if (!can_split[split_type] || (split_type != QT_SPLIT && depth == 0) || (split_type == QT_SPLIT && depth == 1)) continue; + if (!can_split[split_type] || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8)) continue; split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, diff --git a/src/search_intra.c b/src/search_intra.c index 2a406076..4ee36f95 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1476,7 +1476,7 @@ int8_t uvg_search_intra_chroma_rdo( } pred_cu->cr_lfnst_idx = lfnst; chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->lambda; - if (PU_IS_TU(pred_cu)) { + if (PU_IS_TU(pred_cu) && (tree_type != UVG_CHROMA_T || (pred_cu->log2_width < 5 && pred_cu->log2_height < 5))) { uvg_intra_predict( state, &refs[COLOR_U - 1], @@ -1594,7 +1594,7 @@ int8_t uvg_search_cu_intra_chroma( const cu_info_t *cur_pu = &search_data->pred_cu; int8_t modes[8] = { 0, 50, 18, 1, luma_mode, 81, 82, 83 }; - uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); + uint8_t total_modes = (state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_pu, tree_type) ? 
8 : 5); for(int i = 0; i < 4; i++) { if (modes[i] == luma_mode) { modes[i] = 66; From 065eb6fc030b470a70dbb55eafd149262907fe36 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 30 Nov 2022 13:26:48 +0200 Subject: [PATCH 131/254] [mtt] fix lfnst --- src/encode_coding_tree.c | 2 +- src/encoderstate.h | 7 ++++++- src/search.c | 1 + src/strategies/avx2/quant-avx2.c | 4 ++-- src/strategies/generic/quant-generic.c | 10 +++++----- src/transform.c | 22 ++++++++++++---------- src/transform.h | 6 ++++-- 7 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index b73a446c..01806ba8 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -124,7 +124,7 @@ bool uvg_is_lfnst_allowed( if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type)) || (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || - (cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH)) { + (cu_width > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) || cu_height > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)))) { return false; } bool luma_flag = tree_type != UVG_CHROMA_T; diff --git a/src/encoderstate.h b/src/encoderstate.h index 6cad3e36..6df843d7 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -359,7 +359,12 @@ typedef struct encoder_state_t { //Constraint structure void * constraint; - + // Since lfnst needs the collocated luma intra mode for + // dual tree if the chroma mode is cclm mode and getting all of + // the information that would be necessary to get the collocated + // luma mode in the lfnst functions, instead store the current + // collocated luma mode in the state. 
+ int8_t collocated_luma_mode; } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); diff --git a/src/search.c b/src/search.c index 9546cc0a..c17e7821 100644 --- a/src/search.c +++ b/src/search.c @@ -1111,6 +1111,7 @@ static double search_cu( chroma_loc, cu_loc, &intra_search.pred_cu, is_separate_tree ? lcu : NULL, tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, UVG_CHROMA_T); + state->collocated_luma_mode = intra_mode; intra_search.pred_cu.type = CU_INTRA; } else if (intra_search.pred_cu.intra.mip_flag) { intra_mode = 0; diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index a7e0f2f6..d49b2f8f 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -703,7 +703,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } // Quantize coeffs. 
(coeff -> coeff_out) @@ -739,7 +739,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Inverse low frequency non-separable transform - uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } if (use_trskip) { uvg_itransformskip(state->encoder_control, residual, coeff, width, height); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 04a668f3..8c5649dc 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -314,7 +314,7 @@ int uvg_quant_cbcr_residual_generic( uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } if (state->encoder_control->cfg.rdoq_enable && @@ -329,7 +329,7 @@ int uvg_quant_cbcr_residual_generic( } else { uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx); + scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->cr_lfnst_idx); } int8_t has_coeffs = 0; @@ -349,7 +349,7 @@ int uvg_quant_cbcr_residual_generic( uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? 
COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); if (cur_cu->cr_lfnst_idx) { - uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); @@ -491,7 +491,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } @@ -533,7 +533,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Inverse low frequency non-separable transform - uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } if (use_trskip) { uvg_itransformskip(state->encoder_control, residual, coeff, width, height); diff --git a/src/transform.c b/src/transform.c index 7e2b64ee..54ec2ecd 100644 --- a/src/transform.c +++ b/src/transform.c @@ -554,9 +554,9 @@ void uvg_chroma_transform_search( bool v_has_coeffs = false; bool is_jccr = IS_JCCR_MODE(transforms[i]); if(pred_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); + uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); if (!is_jccr) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + 
uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } } quantize_chroma( @@ -572,7 +572,7 @@ void uvg_chroma_transform_search( &u_has_coeffs, &v_has_coeffs, pred_cu->cr_lfnst_idx); - if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; + if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { bool constraints[2] = { false, false }; @@ -591,7 +591,7 @@ void uvg_chroma_transform_search( if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); @@ -622,7 +622,7 @@ void uvg_chroma_transform_search( if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); @@ -856,7 +856,8 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { const uint16_t lfnst_index = lfnst_idx; const uint32_t log2_width = uvg_g_convert_to_log2[width]; @@ -879,7 +880,7 @@ void uvg_fwd_lfnst( const uint32_t* scan = whge3 ? 
uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mode; + intra_mode = luma_mode; } if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode @@ -989,7 +990,8 @@ void uvg_inv_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { // In VTM, max log2 dynamic range is something in range [15, 20] depending on whether extended precision processing is enabled // Such is not yet present in uvg266 so use 15 for now @@ -1010,7 +1012,7 @@ void uvg_inv_lfnst( const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mip_flag ? 0 : cur_cu->intra.mode; + intra_mode = luma_mode; } if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode @@ -1299,7 +1301,7 @@ static void quantize_tr_residual( for (int j = 0; j < tr_height; ++j) { memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); } - cbf_set(&cur_pu->cbf, color); + cbf_set(&cur_pu->cbf, COLOR_U); } else { for (int j = 0; j < tr_height; ++j) { diff --git a/src/transform.h b/src/transform.h index e96a2893..d2b95ca8 100644 --- a/src/transform.h +++ b/src/transform.h @@ -131,7 +131,8 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); void uvg_inv_lfnst( const cu_info_t* cur_cu, @@ -140,6 +141,7 @@ void uvg_inv_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t* coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); #endif From 6a6bed7f1f0f42da77a79f116feabb295243f976 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 1 Dec 2022 14:38:35 +0200 Subject: [PATCH 132/254] [mtt] WIP --- src/cfg.c 
| 70 ++++++++++++++++++++++++++++++++-------- src/cli.c | 27 +++++++++++++--- src/cu.c | 17 +++++----- src/encode_coding_tree.c | 8 +++-- src/global.h | 6 ++-- src/intra.c | 13 ++++++-- src/search.c | 11 ++++--- src/uvg266.h | 8 ++--- 8 files changed, 118 insertions(+), 42 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 1bd6587d..a7555ddc 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -227,9 +227,9 @@ int uvg_config_init(uvg_config *cfg) cfg->min_qt_size[1] = 4; cfg->min_qt_size[2] = 4; - cfg->max_btt_depth[0] = 2; + cfg->max_btt_depth[0] = 0; cfg->max_btt_depth[1] = 0; - cfg->max_btt_depth[2] = 1; + cfg->max_btt_depth[2] = 0; cfg->max_tt_size[0] = 64; cfg->max_bt_size[0] = 64; @@ -350,7 +350,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil return 1; } -/* + static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; @@ -366,7 +366,7 @@ static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) return 1; } } -*/ + static int parse_int8(const char *numstr,int8_t* number,int min, int max) { char *tail; @@ -382,7 +382,7 @@ static int parse_int8(const char *numstr,int8_t* number,int min, int max) return 1; } } -/* + static int parse_array(const char *array, uint8_t *coeff_key, int size, int min, int max) { @@ -406,15 +406,15 @@ static int parse_array(const char *array, uint8_t *coeff_key, int size, free(key); return 0; } - else if (idual_tree = atobool(value); } + else if OPT("mtt-depth-intra") { + cfg->max_btt_depth[0] = atoi(value); + } + else if OPT("mtt-depth-intra-chroma") { + cfg->max_btt_depth[2] = atoi(value); + } + else if OPT("mtt-depth-inter") { + cfg->max_btt_depth[1] = atoi(value); + } + else if OPT("max-bt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_bt_size[0] = sizes[0]; + cfg->max_bt_size[1] = sizes[0]; + cfg->max_bt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_bt_size[0] = sizes[0]; + 
cfg->max_bt_size[1] = sizes[1]; + cfg->max_bt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-bt-size\n"); + return 0; + } + } + else if OPT("max-tt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[0]; + cfg->max_tt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[1]; + cfg->max_tt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-tt-size\n"); + return 0; + } + } else if OPT("intra-rough-granularity") { cfg->intra_rough_search_levels = atoi(value); } @@ -1507,7 +1550,8 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) return 0; } cfg->ibc = (uint8_t)ibc_value; - } else { + } + else { return 0; } #undef OPT diff --git a/src/cli.c b/src/cli.c index e831e4ed..ab91e844 100644 --- a/src/cli.c +++ b/src/cli.c @@ -192,6 +192,11 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, + {"mtt-depth-intra", required_argument, NULL, 0 }, + {"mtt-depth-inter", required_argument, NULL, 0 }, + {"mtt-depth-intra-chroma", required_argument, NULL, 0 }, + {"max_bt_size", required_argument, NULL, 0 }, + {"max_tt_size", required_argument, NULL, 0 }, { "intra-rough-granularity",required_argument, NULL, 0 }, { "ibc", required_argument, NULL, 0 }, {0, 0, 0, 0} @@ -603,14 +608,14 @@ void print_help(void) " - 2: + 1/2-pixel diagonal\n" " - 3: + 1/4-pixel horizontal and vertical\n" " - 4: + 1/4-pixel diagonal\n" - " --pu-depth-inter - : Inter prediction units sizes [0-3]\n" - " - 0, 1, 2, 3: from 64x64 to 8x8\n" + " --pu-depth-inter - : Maximum and minimum split depths where\n" + " inter search is performed 0..8. 
[0-3]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" " value for the respective layer).\n" - " --pu-depth-intra - : Intra prediction units sizes [1-4]\n" - " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" + " --pu-depth-intra - : Maximum and minimum split depths where\n" + " intra search is performed 0..8. [1-4]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" @@ -618,6 +623,20 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --mtt-depth-intra : Depth of mtt for intra slices 0..3.[0]\n" + " --mtt-depth-intra-chroma : Depth of mtt for chroma dual tree in\n" + " intra slices 0..3.[0]\n" + " --mtt-depth-inter : Depth of mtt for inter slices 0..3.[0]\n" + " --max-bt-size : maximum size for a CU resulting from\n" + " a bt split. A singular value shared for all\n" + " or a list of three values for the different\n" + " slices types (intra, inter, intra-chroma)\n" + " can be provided. [64, 64, 32]\n" + " --max-tt-size : maximum size for a CU resulting from\n" + " a tt split. A singular value shared for all\n" + " or a list of three values for the different\n" + " slices types (intra, inter, intra-chroma)\n" + " can be provided. [64, 64, 32]\n" " --intra-rough-granularity : How many levels are used for the\n" " logarithmic intra rough search. 
0..4\n" " With 0 all of the modes are checked \n" diff --git a/src/cu.c b/src/cu.c index 0c8dd9f7..9908d43e 100644 --- a/src/cu.c +++ b/src/cu.c @@ -330,6 +330,7 @@ int uvg_get_split_locs( const int half_height = origin->height >> 1; const int quarter_width = origin->width >> 2; const int quarter_height = origin->height >> 2; + if (origin->width == 4 && separate_chroma) *separate_chroma = 1; switch (split) { case NO_SPLIT: @@ -350,7 +351,7 @@ int uvg_get_split_locs( case BT_VER_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); - if (half_width == 4 && separate_chroma) *separate_chroma = 1; + if ((half_width == 4 || half_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; return 2; case TT_HOR_SPLIT: uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); @@ -362,7 +363,7 @@ int uvg_get_split_locs( uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height); - if (quarter_width == 4 && separate_chroma) *separate_chroma = 1; + if ((quarter_width == 4 || quarter_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; return 3; } return 0; @@ -390,10 +391,10 @@ int uvg_get_possible_splits(const encoder_state_t * const state, const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const unsigned max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; // +currImplicitBtDepth; - const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type]; - const unsigned min_bt_size = 1 << MIN_SIZE; - const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type]; - const unsigned min_tt_size = 1 << MIN_SIZE; + const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type] >> (tree_type == UVG_CHROMA_T); + const unsigned min_bt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); + const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type] >> (tree_type == UVG_CHROMA_T); + const unsigned min_tt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; @@ -459,12 +460,12 @@ int uvg_get_possible_splits(const encoder_state_t * const state, //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; - if (cu_loc->chroma_height <= min_tt_size || height > max_tt_size || width > max_tt_size) + if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) splits[TT_HOR_SPLIT] = false; if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; if (tree_type == UVG_CHROMA_T && width * height <= 16 * 2) splits[TT_HOR_SPLIT] = false; - if (cu_loc->chroma_width <= min_tt_size || width > max_tt_size || height > max_tt_size) + if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) splits[TT_VER_SPLIT] = false; if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; if (tree_type == UVG_CHROMA_T && (width * height <= 16 * 2 || width == 8)) splits[TT_VER_SPLIT] = false; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 01806ba8..4f0c3f10 100644 --- 
a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -657,7 +657,7 @@ static void encode_transform_coeff( split_cu_loc[i].chroma_height = split_cu_loc[i].height; } encode_transform_coeff(state, &split_cu_loc[i], only_chroma, - coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], &split_cu_loc[i]); + coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc); } return; } @@ -1391,14 +1391,16 @@ void uvg_encode_coding_tree( uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc[split], separate_chroma ? chroma_loc :(tree_type == UVG_CHROMA_T ? &chroma_tree_loc : &new_cu_loc[split]), - new_split_tree, !separate_chroma || split == splits - 1); + new_split_tree, !separate_chroma || (split == splits - 1 && has_chroma)); } return; } } DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - + + fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); + if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag"); diff --git a/src/global.h b/src/global.h index a6a109c5..c5b73c93 100644 --- a/src/global.h +++ b/src/global.h @@ -145,11 +145,11 @@ typedef int32_t mv_t; #define INTERNAL_MV_PREC 4 // Internal motion vector precision, 4 = 1/16 pel -//! Limits for prediction block sizes. 0 = 64x64, 4 = 4x4. +//! Limits for prediction block sizes. #define PU_DEPTH_INTER_MIN 0 -#define PU_DEPTH_INTER_MAX 3 +#define PU_DEPTH_INTER_MAX 8 #define PU_DEPTH_INTRA_MIN 0 -#define PU_DEPTH_INTRA_MAX 4 +#define PU_DEPTH_INTRA_MAX 8 //! 
Maximum number of layers in GOP structure (for allocating structures) #define MAX_GOP_LAYERS 6 diff --git a/src/intra.c b/src/intra.c index 09c66c13..439910f5 100644 --- a/src/intra.c +++ b/src/intra.c @@ -289,8 +289,15 @@ bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t * co if (tree_type != UVG_CHROMA_T) { return true; } - uint32_t chroma_split = GET_SPLITDATA(cur_cu, 0); - if((chroma_split == BT_VER_SPLIT || chroma_split == TT_VER_SPLIT || chroma_split == TT_HOR_SPLIT) && GET_SPLITDATA(cur_cu, 1) == NO_SPLIT) return false; + uint32_t chroma_split_depth0 = GET_SPLITDATA(cur_cu, 0); + uint32_t chroma_split_depth1 = GET_SPLITDATA(cur_cu, 1); + bool allow = false; + if (chroma_split_depth0 == QT_SPLIT || (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == BT_VER_SPLIT)) allow = true; + else if (chroma_split_depth0 == NO_SPLIT) allow = true; + else if (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == NO_SPLIT) allow = true; + if (!allow) { + return false; + } const cu_info_t* const luma_cu = uvg_cu_array_at_const(state->tile->frame->cu_array, luma_loc->x, luma_loc->y); uint32_t split = GET_SPLITDATA(luma_cu, 0); if (split != QT_SPLIT && split != NO_SPLIT) { @@ -318,7 +325,7 @@ static void get_cclm_parameters( uvg_intra_ref* luma_src, uvg_intra_references*chroma_ref, int16_t *a, int16_t*b, int16_t*shift) { - const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX); + const int base_unit_size = 4; // TODO: take into account YUV422 const int unit_w = base_unit_size >> 1; diff --git a/src/search.c b/src/search.c index c17e7821..f325e785 100644 --- a/src/search.c +++ b/src/search.c @@ -215,7 +215,7 @@ static void work_tree_copy_up( copy_cu_info (from, to, cu_loc, tree_type); copy_cu_pixels(from, to, cu_loc, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); copy_cu_coeffs(cu_loc, from, to, joint, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? 
UVG_LUMA_T : tree_type); - if (cu_loc != chroma_loc && tree_type != UVG_LUMA_T) { + if (chroma_loc && tree_type != UVG_LUMA_T) { copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); } @@ -1201,7 +1201,7 @@ static double search_cu( recon_luma, recon_chroma); - if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 ) + if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, @@ -1361,7 +1361,10 @@ static double search_cu( cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { - if (!can_split[split_type] || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8)) continue; + if (!can_split[split_type] + || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) + || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) + continue; split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1, @@ -1429,7 +1432,7 @@ static double search_cu( &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], &split_lcu[split_type -1], tree_type, new_split, - !separate_chroma || split == splits - 1); + !separate_chroma || (split == splits - 1 && has_chroma)); // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma if (split_cost > cost || split_cost > best_split_cost) { break; diff --git a/src/uvg266.h b/src/uvg266.h index 7d772780..fe6e2b0f 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -543,11 +543,11 @@ typedef struct uvg_config uint8_t dual_tree; - uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/ - uint8_t max_bt_size[3]; - uint8_t max_tt_size[3]; + uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_bt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_tt_size[3]; /* intra, inter, dual tree chroma*/ - uint8_t max_btt_depth[3]; + uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma*/ uint8_t intra_rough_search_levels; From b988c60dd12fe67400ea0fb6c8bc1b0662d0509a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 2 Dec 2022 13:56:38 +0200 Subject: [PATCH 133/254] [mtt] search works completely with everything except RDOQ deblock and ISP --- src/encode_coding_tree.c | 9 +++++---- src/search.c | 41 ++++++++++++++++++++++++++++------------ src/search_intra.c | 3 ++- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 4f0c3f10..ab79ee92 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -657,7 +657,7 @@ static void encode_transform_coeff( split_cu_loc[i].chroma_height = split_cu_loc[i].height; } encode_transform_coeff(state, &split_cu_loc[i], only_chroma, - coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc); + coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc ? 
&split_cu_loc[i] : NULL); } return; } @@ -1399,7 +1399,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); + // fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1616,8 +1616,9 @@ void uvg_encode_coding_tree( if (tree_type != UVG_CHROMA_T) { encode_lfnst_idx(state, cabac, cur_cu, is_local_dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc); + + encode_mts_idx(state, cabac, cur_cu, cu_loc); } - encode_mts_idx(state, cabac, cur_cu, cu_loc); // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != UVG_CSP_400 && @@ -1777,7 +1778,7 @@ double uvg_mock_encode_coding_unit( int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc,cu_loc , cur_cu, tree_type != UVG_CHROMA_T ? lcu : NULL, tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, is_separate_tree ? 
UVG_CHROMA_T : tree_type); - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir, &bits); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, chroma_loc, cur_cu, tree_type), luma_dir, &bits); } } else { diff --git a/src/search.c b/src/search.c index f325e785..c5480a2e 100644 --- a/src/search.c +++ b/src/search.c @@ -76,7 +76,9 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu } -static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, const enum uvg_tree_type tree_type) { +static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, const cu_loc_t* const + chroma_loc, + const enum uvg_tree_type tree_type) { const int y_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); const int x_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); @@ -90,8 +92,8 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu uvg_pixels_blit(from->rec.y, to->rec.y, cu_loc->local_x, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); } if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { - uvg_pixels_blit(from->rec.u, to->rec.u, cu_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(from->rec.v, to->rec.v, cu_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(from->rec.u, to->rec.u, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(from->rec.v, to->rec.v, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -106,11 +108,11 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu LCU_WIDTH, LCU_WIDTH); } if (tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { - uvg_pixels_blit(&from->rec.u[cu_loc->local_x / 2], &to->rec.u[cu_loc->local_x / 2], - LCU_WIDTH_C - 
cu_loc->local_x / 2, cu_loc->local_y / 2, + uvg_pixels_blit(&from->rec.u[chroma_loc->local_x / 2], &to->rec.u[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&from->rec.v[cu_loc->local_x / 2], &to->rec.v[cu_loc->local_x / 2], - LCU_WIDTH_C - cu_loc->local_x / 2, cu_loc->local_y / 2, + uvg_pixels_blit(&from->rec.v[chroma_loc->local_x / 2], &to->rec.v[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -129,16 +131,15 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu } if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { - const int offset = cu_loc->local_x / 2 + cu_loc->local_y / 2 * LCU_WIDTH_C; - uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + const int offset = chroma_loc->local_x / 2 + chroma_loc->local_y / 2 * LCU_WIDTH_C; + uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); - } for (int x = x_start; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); @@ -149,6 +150,22 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu memset(LCU_GET_CU_AT_PX(to, x, y), 0, 
sizeof(cu_info_t)); } } + + if(chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x && tree_type == UVG_BOTH_T) { + const int y_start = (chroma_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; + const int x_start = (chroma_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + } + if (chroma_loc->local_x == 0) { + to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + if (chroma_loc->local_y == 0) { + to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + } } static INLINE void copy_cu_pixels( @@ -1425,7 +1442,7 @@ static double search_cu( cu_loc_t new_cu_loc[4]; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); - initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, tree_type); + initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? 
chroma_loc : cu_loc , tree_type); for (int split = 0; split < splits; ++split) { new_split.part_index = split; split_cost += search_cu(state, diff --git a/src/search_intra.c b/src/search_intra.c index 4ee36f95..2856a7d4 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -377,7 +377,8 @@ static double search_intra_trdepth( pred_cu->mts_last_scan_pos = 0; pred_cu->violates_mts_coeff_constraint = 0; - if (trafo == MTS_SKIP && (width > (1 << state->encoder_control->cfg.trskip_max_size) + if (trafo == MTS_SKIP && ((width > (1 << state->encoder_control->cfg.trskip_max_size) + || (height > (1 << state->encoder_control->cfg.trskip_max_size))) || !state->encoder_control->cfg.trskip_enable)) { continue; } From 2da1a34ff36a52d7c6f32df7f04fb9685a23f641 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 6 Dec 2022 11:23:30 +0200 Subject: [PATCH 134/254] [mtt] Fix isp for MTT --- src/encode_coding_tree.c | 44 ++++++++++-- src/encode_coding_tree.h | 2 +- src/global.h | 4 +- src/intra.c | 95 +++++++++++++++----------- src/intra.h | 2 +- src/search.c | 8 +-- src/search_intra.c | 16 +++-- src/strategies/avx2/intra-avx2.c | 3 +- src/strategies/generic/dct-generic.c | 21 ++++-- src/strategies/generic/intra-generic.c | 12 ++-- src/strategies/generic/quant-generic.c | 15 ++-- src/strategies/strategies-intra.h | 3 +- src/transform.c | 25 +++++-- 13 files changed, 160 insertions(+), 90 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ab79ee92..d3f4de29 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -111,7 +111,7 @@ bool uvg_is_lfnst_allowed( const cu_info_t* const pred_cu, enum uvg_tree_type tree_type, const color_t color, - const cu_loc_t* const cu_loc) + const cu_loc_t* const cu_loc, const lcu_t* const lcu) { if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && PU_IS_TU(pred_cu)) { const int isp_mode = pred_cu->intra.isp_mode; @@ -121,22 +121,51 @@ bool uvg_is_lfnst_allowed( bool is_sep_tree = 
tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? pred_cu->intra.mip_flag : false; - if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type)) || - (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || + if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type) && color == COLOR_Y) || + (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip && color == COLOR_Y) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || (cu_width > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) || cu_height > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)))) { return false; } bool luma_flag = tree_type != UVG_CHROMA_T; bool chroma_flag = tree_type != UVG_LUMA_T; - bool non_zero_coeff_non_ts_corner_8x8 = (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + bool non_zero_coeff_non_ts_corner_8x8 = false; + bool last_scan_pos = false; bool is_tr_skip = false; + int split_num = color == COLOR_Y && isp_mode ? uvg_get_isp_split_num(cu_width, cu_height, isp_mode, false) : 0; + const videoframe_t* const frame = state->tile->frame; + + if (split_num) { + // Constraints for ISP split blocks + for (int i = 0; i < split_num; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_width, cu_height, i, isp_mode, false); + int local_split_x = split_loc.x; + int local_split_y = split_loc.y; + uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); + const cu_info_t* split_cu = lcu ? 
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : + uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); + + //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + // ISP_TODO: remove this if clause altogether if it seems it is not needed + if (true) { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); + //last_scan_pos |= split_cu->lfnst_last_scan_pos; + last_scan_pos |= true; + } + } + } + else { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + last_scan_pos |= pred_cu->lfnst_last_scan_pos; + } + if (color == COLOR_Y && pred_cu->tr_idx == MTS_SKIP) { is_tr_skip = true; } - if ((!pred_cu->lfnst_last_scan_pos && !isp_mode) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { + if ((!last_scan_pos) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { return false; } return true; @@ -155,7 +184,7 @@ static bool encode_lfnst_idx( const cu_loc_t* const cu_loc) { - if (uvg_is_lfnst_allowed(state, pred_cu, tree_type, color, cu_loc)) { + if (uvg_is_lfnst_allowed(state, pred_cu, tree_type, color, cu_loc, NULL)) { // Getting separate tree bool from block size is a temporary fix until a proper dual tree check is possible (there is no dual tree structure at time of writing this). 
// VTM seems to force explicit dual tree structure for small 4x4 blocks bool is_separate_tree = tree_type != UVG_BOTH_T; @@ -1399,7 +1428,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - // fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1611,6 +1640,7 @@ void uvg_encode_coding_tree( encode_transform_coeff(state, &split_loc, 0, coeff, NULL, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, cu_loc, is_local_dual_tree ? NULL : chroma_loc); + can_skip_last_cbf &= luma_cbf_ctx == 2; } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 96e0cfb7..0e72369e 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -47,7 +47,7 @@ bool uvg_is_lfnst_allowed( const cu_info_t* const pred_cu, enum uvg_tree_type tree_type, const color_t color, - const cu_loc_t* const cu_loc); + const cu_loc_t* const cu_loc, const lcu_t* const lcu); void uvg_encode_coding_tree( encoder_state_t * const state, diff --git a/src/global.h b/src/global.h index c5b73c93..27058463 100644 --- a/src/global.h +++ b/src/global.h @@ -128,9 +128,9 @@ typedef int16_t coeff_t; typedef int32_t mv_t; -#define VERBOSE 1 +//#define VERBOSE 1 #define UVG_DEBUG_PRINT_CABAC 1 -#define UVG_DEBUG 1 +//#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/intra.c b/src/intra.c index 439910f5..429254c1 100644 --- a/src/intra.c +++ b/src/intra.c @@ -300,13 +300,13 @@ bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t * co } const cu_info_t* const luma_cu = uvg_cu_array_at_const(state->tile->frame->cu_array, luma_loc->x, luma_loc->y); uint32_t split = GET_SPLITDATA(luma_cu, 0); - if (split != QT_SPLIT && split != 
NO_SPLIT) { - return false; + if (split != NO_SPLIT) { + allow = split == QT_SPLIT; } - if (split != NO_SPLIT && luma_cu->intra.isp_mode != ISP_MODE_NO_ISP) { - return false; + else if (split != NO_SPLIT && luma_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + allow = false; } - return true; + return allow; } @@ -943,11 +943,15 @@ static void mip_predict( } -int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int log2_width, const int log2_height, const - bool account_for_dc_planar) +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const int log2_width, + const int log2_height, + const + bool account_for_dc_planar) { int8_t pred_mode = mode; - if (!is_isp && log2_width != log2_height) { + if (log2_width != log2_height) { if (mode > 1 && mode <= 66) { const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; const int deltaSize = abs(log2_width - log2_height); @@ -965,15 +969,17 @@ int8_t uvg_wide_angle_correction(int_fast8_t mode, const bool is_isp, const int static void intra_predict_regular( const encoder_state_t* const state, uvg_intra_references *refs, + const cu_info_t* const cur_cu, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, int_fast8_t mode, color_t color, uvg_pixel *dst, const uint8_t multi_ref_idx, const uint8_t isp_mode) { - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; const uvg_config *cfg = &state->encoder_control->cfg; @@ -983,11 +989,12 @@ static void intra_predict_regular( uint8_t isp = color == COLOR_Y ? 
isp_mode : 0; // Wide angle correction - int8_t pred_mode = uvg_wide_angle_correction(mode, - isp_mode, - log2_width, - log2_height, - false); + int8_t pred_mode = uvg_wide_angle_correction( + mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + false + ); const uvg_intra_ref *used_ref = &refs->ref; if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { @@ -1019,11 +1026,20 @@ static void intra_predict_regular( } if (mode == 0) { - uvg_intra_pred_planar(cu_loc, color, used_ref->top, used_ref->left, dst); + uvg_intra_pred_planar(pu_loc, color, used_ref->top, used_ref->left, dst); } else if (mode == 1) { - intra_pred_dc(cu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); + intra_pred_dc(pu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(cu_loc, pred_mode, color, used_ref->top, used_ref->left, dst, multi_ref_index, isp); + uvg_angular_pred( + pu_loc, + pred_mode, + color, + used_ref->top, + used_ref->left, + dst, + multi_ref_index, + isp, + isp_mode == ISP_MODE_HOR ? cu_loc->height : cu_loc->width); } // pdpc @@ -1032,7 +1048,7 @@ static void intra_predict_regular( pdpcCondition &= width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH; if (pdpcCondition && multi_ref_index == 0) // Cannot be used with MRL. { - uvg_pdpc_planar_dc(mode, cu_loc, color, used_ref, dst); + uvg_pdpc_planar_dc(mode, pu_loc, color, used_ref, dst); } } @@ -1065,7 +1081,7 @@ void uvg_intra_build_reference_any( bool is_first_isp_block = isp_mode ? 
pu_x == cu_x && pu_y == cu_y : false; - assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel *out_left_ref = &refs->ref.left[0]; @@ -1138,11 +1154,8 @@ void uvg_intra_build_reference_any( px_available_left = height; } else { - px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; - // This table does not have values for dimensions less than 4 - if (lcu_px.y % 4 != 0) { - px_available_left -= 2; - } + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; } } else { @@ -1270,7 +1283,8 @@ void uvg_intra_build_reference_any( px_available_top = width; } else { - px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4]; + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; } } else { @@ -1343,7 +1357,7 @@ void uvg_intra_build_reference_inner( bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; // Log2_dim 1 is possible with ISP blocks - assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel * __restrict out_left_ref = &refs->ref.left[0]; @@ -1457,11 +1471,8 @@ void uvg_intra_build_reference_inner( px_available_left = height; } else { - px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4]; - // This table does not have values for dimensions less than 4 - if (lcu_px.y % 4 != 0) { - px_available_left -= 2; - } + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; } } @@ -1477,7 +1488,7 @@ void uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. 
- px_available_left = MIN(px_available_left, height * 2); + px_available_left = MIN(px_available_left, cu_height * 2); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1529,7 +1540,8 @@ void uvg_intra_build_reference_inner( px_available_top = width; } else { - px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4]; + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; } } else { @@ -1603,6 +1615,7 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, @@ -1614,10 +1627,10 @@ void uvg_intra_predict( // TODO: what is this used for? // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); bool use_mip = false; - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int x = cu_loc->x; - const int y = cu_loc->y; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int x = pu_loc->x; + const int y = pu_loc->y; int8_t intra_mode = color == COLOR_Y ? 
data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma; if (data->pred_cu.intra.mip_flag) { if (color == COLOR_Y) { @@ -1633,7 +1646,7 @@ void uvg_intra_predict( mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { - intra_predict_regular(state, refs, cu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); + intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); } } else { @@ -1748,7 +1761,7 @@ void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int bl if (split_type != ISP_MODE_NO_ISP) { part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type, is_transform_split); } - if(split_type == ISP_MODE_VER && block_w < 16 && !is_transform_split) { + if(split_type == ISP_MODE_VER && block_w < 16 && block_h != 4 && !is_transform_split) { split_idx /= 2; } const int offset = part_dim * split_idx; @@ -1818,7 +1831,7 @@ static void intra_recon_tb_leaf( uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; - uvg_intra_predict(state, &refs, pu_loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu, tree_type); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; diff --git a/src/intra.h b/src/intra.h index 5d7d84e7..515abc85 100644 --- a/src/intra.h +++ b/src/intra.h @@ -134,6 +134,7 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, @@ -168,7 +169,6 @@ uint8_t uvg_get_mip_flag_context( int8_t uvg_wide_angle_correction( int_fast8_t mode, - const bool is_isp, const int log2_width, 
const int log2_height, const bool account_for_dc_planar); diff --git a/src/search.c b/src/search.c index c5480a2e..9c26d160 100644 --- a/src/search.c +++ b/src/search.c @@ -741,7 +741,7 @@ static double cu_rd_cost_tr_split_accurate( if(is_local_sep_tree || tree_type == UVG_LUMA_T) { - if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc)) { + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc, lcu)) { const int lfnst_idx = tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -814,7 +814,7 @@ static double cu_rd_cost_tr_split_accurate( } const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; - if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? cu_loc : chroma_loc)) { + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? cu_loc : chroma_loc, lcu)) { const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -1151,7 +1151,7 @@ static double search_cu( uvg_intra_recon_cu(state, &intra_search, chroma_loc, &intra_search.pred_cu, lcu, - tree_type, + is_separate_tree ? 
UVG_CHROMA_T : tree_type, false, true); if(tree_type != UVG_CHROMA_T) { @@ -1224,7 +1224,7 @@ static double search_cu( uvg_intra_recon_cu(state, &intra_search, chroma_loc, cur_cu, lcu, - tree_type, + UVG_CHROMA_T, false, true); } else { diff --git a/src/search_intra.c b/src/search_intra.c index 2856a7d4..557dff4e 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -660,7 +660,7 @@ static int search_intra_chroma_rough( for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, &refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_u, cu_loc, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -679,7 +679,7 @@ static int search_intra_chroma_rough( for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, &refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_v, cu_loc, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -1026,9 +1026,9 @@ static uint8_t search_intra_rough( int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; - uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); search_proxy.pred_cu.intra.mode = 1; - 
uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs); mode_checked[0] = true; mode_checked[1] = true; @@ -1078,7 +1078,7 @@ static uint8_t search_intra_rough( for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { search_proxy.pred_cu.intra.mode = mode + i*offset; - uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); } } @@ -1150,7 +1150,7 @@ static uint8_t search_intra_rough( for (int block = 0; block < PARALLEL_BLKS; ++block) { search_proxy.pred_cu.intra.mode = modes_to_check[block + i]; - uvg_intra_predict(state, refs, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); } @@ -1241,7 +1241,7 @@ static void get_rough_cost_for_2n_modes( double bits[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { - uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); + uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); } get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); @@ -1482,6 +1482,7 @@ int8_t uvg_search_intra_chroma_rdo( state, &refs[COLOR_U - 1], cu_loc, + cu_loc, COLOR_U, u_pred, &chroma_data[mode_i], @@ -1491,6 +1492,7 @@ int8_t uvg_search_intra_chroma_rdo( state, &refs[COLOR_V - 1], cu_loc, + cu_loc, COLOR_V, v_pred, &chroma_data[mode_i], 
diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1d3c117f..838bad91 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -60,7 +60,8 @@ static void uvg_angular_pred_avx2( const uvg_pixel *const in_ref_left, uvg_pixel *const dst, const uint8_t multi_ref_idx, - const uint8_t isp_mode) + const uint8_t isp_mode, + const int cu_dim) { // ISP_TODO: non-square block implementation, height is passed but not used const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index d5fdb88e..fec783b6 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2605,9 +2605,14 @@ static void mts_dct_generic( int16_t tmp[32 * 32]; const int32_t shift_1st = log2_width_minus1 + bitdepth - 8; const int32_t shift_2nd = log2_height_minus1 + 7; - - dct_hor(input, tmp, shift_1st, height, 0, skip_width); - dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + if (height == 1) { + dct_hor(input, output, shift_1st, height, 0, skip_width); + } else if (width == 1) { + dct_ver(input, output, shift_2nd, width, 0, skip_height); + } else { + dct_hor(input, tmp, shift_1st, height, 0, skip_width); + dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + } } } @@ -2660,8 +2665,14 @@ static void mts_idct_generic( const int32_t shift_1st = transform_matrix_shift + 1; const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth; - idct_ver(input, tmp, shift_1st, width, skip_width, skip_height); - idct_hor(tmp, output, shift_2nd, height, 0, skip_width); + if (height == 1) { + idct_hor(input, output, shift_1st, height, 0, skip_width); + } else if (width == 1) { + idct_ver(input, output, shift_2nd, width, 0, skip_height); + } else { + idct_ver(input, tmp, shift_1st, width, skip_width, skip_height); + idct_hor(tmp, output, 
shift_2nd, height, 0, skip_width); + } } } diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index b7ab7e94..e00ac48a 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -59,7 +59,8 @@ static void uvg_angular_pred_generic( const uvg_pixel *const in_ref_left, uvg_pixel *const dst, const uint8_t multi_ref_idx, - const uint8_t isp_mode) + const uint8_t isp_mode, + const int cu_dim) { int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -141,10 +142,9 @@ static void uvg_angular_pred_generic( // Pointer for the other reference. const uvg_pixel *ref_side; uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst; - - const int cu_dim = MAX(width, height); - const int top_ref_length = isp_mode ? width + cu_dim : width << 1; - const int left_ref_length = isp_mode ? height + cu_dim : height << 1; + + const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1; + const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1; // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. 
@@ -338,7 +338,7 @@ static void uvg_intra_pred_planar_generic( const int final_shift = 1 + log2_width + log2_height; // If ISP is enabled log_dim 1 is possible (limit was previously 2) - assert((log2_width >= 1 && log2_width <= 5) && (log2_height >= 1 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); const uvg_pixel top_right = ref_top[width + 1]; const uvg_pixel bottom_left = ref_left[height + 1]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 8c5649dc..8d2a85da 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -313,15 +313,16 @@ int uvg_quant_cbcr_residual_generic( uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); - if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type, state->collocated_luma_mode); + uint8_t lfnst_idx = tree_type == UVG_CHROMA_T ? cur_cu->cr_lfnst_idx : cur_cu->lfnst_idx; + if(lfnst_idx) { + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx); } else if (state->encoder_control->cfg.rdoq_enable && false) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, @@ -329,7 +330,7 @@ int uvg_quant_cbcr_residual_generic( } else { uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? 
COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->cr_lfnst_idx); + scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, lfnst_idx); } int8_t has_coeffs = 0; @@ -348,8 +349,8 @@ int uvg_quant_cbcr_residual_generic( // Get quantized residual. (coeff_out -> coeff -> residual) uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - if (cur_cu->cr_lfnst_idx) { - uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type, state->collocated_luma_mode); + if (lfnst_idx) { + uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); @@ -487,7 +488,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } - const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; + const uint8_t lfnst_index = tree_type != UVG_CHROMA_T || color == COLOR_Y ? 
cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index ce008d01..52f5e519 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -52,7 +52,8 @@ typedef void (angular_pred_func)( const uvg_pixel *const in_ref_left, uvg_pixel *const dst, const uint8_t multi_ref_idx, - const uint8_t isp_mode); + const uint8_t isp_mode, + const int cu_dim); typedef void (intra_pred_planar_func)( const cu_loc_t* const cu_loc, diff --git a/src/transform.c b/src/transform.c index 54ec2ecd..4d953454 100644 --- a/src/transform.c +++ b/src/transform.c @@ -571,7 +571,7 @@ void uvg_chroma_transform_search( SCAN_DIAG, &u_has_coeffs, &v_has_coeffs, - pred_cu->cr_lfnst_idx); + tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx); if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { @@ -720,7 +720,7 @@ void uvg_chroma_transform_search( COEFF_ORDER_LINEAR); } if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { - if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc)) { + if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc, lcu)) { const int lfnst_idx = pred_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( &state->search_cabac, @@ -873,7 +873,7 @@ void uvg_fwd_lfnst( const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip) + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { assert(log2_width != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; @@ -887,7 +887,12 @@ void uvg_fwd_lfnst( } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. 
Must be in [0, 2]"); - int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height, true); + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); // Transform wide angle mode to intra mode intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); @@ -1007,7 +1012,7 @@ void uvg_inv_lfnst( bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip) { + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { const bool whge3 = width >= 8 && height >= 8; const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; @@ -1019,7 +1024,12 @@ void uvg_inv_lfnst( } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); - int32_t wide_adjusted_mode = uvg_wide_angle_correction(intra_mode, cur_cu->intra.isp_mode != 0, log2_width, log2_height, true); + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); @@ -1386,7 +1396,8 @@ void uvg_quantize_lcu_residual( // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. // Width 2 is possible with ISP blocks // ISP_TODO: no, they actually are not - assert(width == 2 || + assert(width == 1 || + width == 2 || width == 4 || width == 8 || width == 16 || From 412dd20f09dde570fbf652c04db3998aa81ac6ae Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 6 Dec 2022 14:23:31 +0200 Subject: [PATCH 135/254] [mtt] Fix implicit splits for non ctu divisible frames. 
--- src/cu.c | 18 +++++++++++++----- src/cu.h | 1 + src/encode_coding_tree.c | 39 ++++++++++++++++++++++++++------------- src/encode_coding_tree.h | 1 + src/encoderstate.c | 2 +- src/search.c | 26 ++++++++++++++++---------- 6 files changed, 58 insertions(+), 29 deletions(-) diff --git a/src/cu.c b/src/cu.c index 9908d43e..d4ef2881 100644 --- a/src/cu.c +++ b/src/cu.c @@ -370,10 +370,17 @@ int uvg_get_split_locs( } -int uvg_get_implicit_split(const encoder_state_t* const state, const cu_loc_t* const cu_loc) +int uvg_get_implicit_split(const encoder_state_t* const state, const cu_loc_t* const cu_loc, enum + uvg_tree_type tree_type) { - bool right_ok = state->tile->frame->width >= cu_loc->x + cu_loc->width; - bool bottom_ok = state->tile->frame->height >= cu_loc->y + cu_loc->height; + // This checking if cabac is in update state is a very dirty way of checking + // whether we are in the search or writing the bitstream, and unfortunately the + // coordinates are different for chroma tree in those two conditions. It might be + // possible to pass the chroma loc for uvg_get_possible_splits in the search but + // then all of the conditions need to be checked in that function. + // This current solutions *might* not work with alf enabled but I think it should work + bool right_ok = (state->tile->frame->width >> (tree_type == UVG_CHROMA_T && state->cabac.update)) >= cu_loc->x + cu_loc->width; + bool bottom_ok = (state->tile->frame->height >> (tree_type == UVG_CHROMA_T && state->cabac.update)) >= cu_loc->y + cu_loc->height; if (right_ok && bottom_ok) return NO_SPLIT; if (right_ok) return BT_HOR_SPLIT; @@ -387,10 +394,11 @@ int uvg_get_possible_splits(const encoder_state_t * const state, { const int width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; - const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc); + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, tree_type); const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; - const unsigned max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; // +currImplicitBtDepth; + const unsigned max_btd = + state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth; const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type] >> (tree_type == UVG_CHROMA_T); const unsigned min_bt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type] >> (tree_type == UVG_CHROMA_T); diff --git a/src/cu.h b/src/cu.h index 7722852d..11325719 100644 --- a/src/cu.h +++ b/src/cu.h @@ -106,6 +106,7 @@ typedef struct { uint32_t split_tree; uint8_t current_depth; uint8_t mtt_depth; + uint8_t implicit_mtt_depth; uint8_t part_index; } split_tree_t; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index d3f4de29..e5981eca 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1239,6 +1239,7 @@ uint8_t uvg_write_split_flag( const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out) { double bits = 0; @@ -1257,7 +1258,10 @@ uint8_t uvg_write_split_flag( enum split_type split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; - split_flag = is_implicit ? (can_split[QT_SPLIT] ? QT_SPLIT : (can_split[BT_HOR_SPLIT] ? BT_HOR_SPLIT : BT_VER_SPLIT)) : split_flag; + assert(can_split[split_flag] && "Trying to write an illegal split"); + + // split_flag = is_implicit ? (can_split[QT_SPLIT] ? QT_SPLIT : (can_split[BT_HOR_SPLIT] ? 
BT_HOR_SPLIT : BT_VER_SPLIT)) : split_flag; + *is_implicit_out = is_implicit; int split_model = 0; if (can_split[NO_SPLIT] && allow_split) { @@ -1287,7 +1291,9 @@ uint8_t uvg_write_split_flag( } - if (!is_implicit && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) && split_flag != NO_SPLIT) { + if ((!is_implicit || (can_split[QT_SPLIT] && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT]))) + && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) + && split_flag != NO_SPLIT) { bool qt_split = split_flag == QT_SPLIT; if((can_split[BT_VER_SPLIT] || can_split[BT_HOR_SPLIT] || can_split[TT_VER_SPLIT] || can_split[TT_HOR_SPLIT]) && can_split[QT_SPLIT]) { unsigned left_qt_depth = 0; @@ -1374,12 +1380,9 @@ void uvg_encode_coding_tree( int32_t frame_width = tree_type != UVG_CHROMA_T ? ctrl->in.width : ctrl->in.width / 2; int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2; - // Check for slice border - bool border_x = frame_width < abs_x + cu_width; - bool border_y = frame_height < abs_y + cu_height; - bool border_split_x = frame_width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + cu_width / 2; - bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + cu_height / 2; - bool border = border_x || border_y; /*!< are we in any border CU */ + + // Stop if we are outside of the frame + if (abs_x >= frame_width || abs_y >= frame_height) return; if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; @@ -1388,6 +1391,7 @@ void uvg_encode_coding_tree( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (cu_width + cu_height > 8) { split_tree.split_tree = cur_cu->split_tree; + bool is_implicit; const int split_flag = uvg_write_split_flag( state, cabac, @@ -1396,10 +1400,16 @@ void uvg_encode_coding_tree( tree_type != UVG_CHROMA_T ? 
cu_loc : chroma_loc, split_tree, tree_type, - NULL); + &is_implicit, + NULL + ); - if (split_flag || border) { - split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT), 0}; + if (split_flag != NO_SPLIT) { + split_tree_t new_split_tree = { cur_cu->split_tree, + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_flag != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_flag != QT_SPLIT && is_implicit), + 0}; cu_loc_t new_cu_loc[4]; cu_loc_t chroma_tree_loc; @@ -1731,6 +1741,8 @@ double uvg_mock_encode_coding_unit( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (cur_cu->log2_height + cur_cu->log2_width > 4) { + // We do not care about whether the split is implicit or not since there is never split here + bool is_implicit; uvg_write_split_flag( state, cabac, @@ -1738,8 +1750,9 @@ double uvg_mock_encode_coding_unit( above_cu, cu_loc, split_tree, - tree_type, - &bits); + tree_type, &is_implicit, + &bits + ); } // Encode skip flag diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 0e72369e..3df702ef 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -108,6 +108,7 @@ uint8_t uvg_write_split_flag( const cu_loc_t* const cu_loc, split_tree_t, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out); void uvg_encode_last_significant_xy(cabac_data_t * const cabac, diff --git a/src/encoderstate.c b/src/encoderstate.c index 6cd9bbca..32ecfeac 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -883,7 +883,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) //Encode coding tree cu_loc_t start; uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0, 0 }; + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); diff 
--git a/src/search.c b/src/search.c index 9c26d160..c23540c9 100644 --- a/src/search.c +++ b/src/search.c @@ -1382,19 +1382,13 @@ static double search_cu( || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; - split_tree_t new_split = { - split_tree.split_tree | split_type << (split_tree.current_depth * 3), - split_tree.current_depth + 1, - split_tree.mtt_depth + (split_type != QT_SPLIT), - 0 - }; - double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); double split_bits = 0; + bool is_implicit = false; if (cur_cu->log2_height + cur_cu->log2_width > 4) { @@ -1427,9 +1421,20 @@ static double search_cu( tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, count_tree, tree_type, - &split_bits); + &is_implicit, + &split_bits + ); } + + split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_type != QT_SPLIT && is_implicit), + 0 + }; + state->search_cabac.update = 0; split_cost += split_bits * state->lambda; @@ -1489,7 +1494,8 @@ static double search_cu( double bits = 0; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &bits); + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, NULL, + &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -1783,7 +1789,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con cu_loc_t start; uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); - split_tree_t split_tree = { 0, 0, 0 }; + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; // Start search from depth 0. double cost = search_cu( state, From ba0d43d846f6d1b90aa56696878f0f1b14807bcc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 7 Dec 2022 14:56:40 +0200 Subject: [PATCH 136/254] [mtt] Fill chroma data for the whole area covered by the local separate tree chroma cu --- src/cu.h | 2 +- src/encode_coding_tree.c | 6 +- src/intra.c | 15 +++- src/search.c | 145 ++++++++++++++++++++++++++------------- src/search_intra.c | 5 +- src/transform.c | 4 +- 6 files changed, 116 insertions(+), 61 deletions(-) diff --git a/src/cu.h b/src/cu.h index 11325719..751a483c 100644 --- a/src/cu.h +++ b/src/cu.h @@ -150,7 +150,7 @@ typedef struct uint8_t mts_last_scan_pos : 1; uint8_t violates_lfnst_constrained_luma : 1; - uint8_t violates_lfnst_constrained_chroma : 1; + uint8_t violates_lfnst_constrained_chroma; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index e5981eca..c3400524 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1438,7 +1438,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - //fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1668,11 +1668,11 @@ void uvg_encode_coding_tree( int8_t luma_dir = 
uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir,NULL); // LFNST constraints must be reset here. Otherwise the left over values will interfere when calculating new constraints - cu_info_t* tmp = (cu_info_t*)cur_cu; + cu_info_t* tmp = uvg_cu_array_at((cu_array_t *)used_array, chroma_loc->x, chroma_loc->y); tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, chroma_loc, 1, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); + encode_transform_coeff(state, chroma_loc, 1, coeff, NULL, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? 
UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } diff --git a/src/intra.c b/src/intra.c index 429254c1..e39878df 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1884,7 +1884,14 @@ void uvg_intra_recon_cu( bool recon_chroma) { const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - const vector2d_t lcu_px = { cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T) }; + const vector2d_t lcu_px = { + cu_loc->local_x >> + (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && + state->frame->slicetype == UVG_SLICE_I), + cu_loc->local_y >> + (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && + state->frame->slicetype == UVG_SLICE_I), + }; const int8_t width = cu_loc->width; const int8_t height = cu_loc->height; if (cur_cu == NULL) { @@ -1917,7 +1924,11 @@ void uvg_intra_recon_cu( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - uvg_intra_recon_cu(state, search_data, &split_cu_loc[i], NULL, lcu, tree_type, recon_luma, recon_chroma); + uvg_intra_recon_cu( + state, search_data, &split_cu_loc[i], + NULL, lcu, + state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? 
tree_type : UVG_BOTH_T, + recon_luma, recon_chroma); } return; diff --git a/src/search.c b/src/search.c index c23540c9..5f141d4d 100644 --- a/src/search.c +++ b/src/search.c @@ -134,7 +134,15 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu const int offset = chroma_loc->local_x / 2 + chroma_loc->local_y / 2 * LCU_WIDTH_C; uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); - } + } + if(chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x && tree_type == UVG_BOTH_T) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + } const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; @@ -217,6 +225,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to } } + +static void lcu_fill_chroma_cu_info(lcu_t* lcu, const cu_loc_t* const cu_loc); /** * Copy all non-reference CU data from next level to current level. 
*/ @@ -235,7 +245,20 @@ static void work_tree_copy_up( if (chroma_loc && tree_type != UVG_LUMA_T) { copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); + + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += 4) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += 4) { + cu_info_t* to_cu = LCU_GET_CU_AT_PX(to, x, y); + cu_info_t* from_cu = LCU_GET_CU_AT_PX(from, x, y); + to_cu->intra.mode_chroma = from_cu->intra.mode_chroma; + to_cu->joint_cb_cr = from_cu->joint_cb_cr; + to_cu->cr_lfnst_idx = from_cu->cr_lfnst_idx; + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_U); + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_V); + } + } } + } @@ -250,6 +273,8 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; + to->cr_lfnst_idx = cu->cr_lfnst_idx; + to->joint_cb_cr = cu->joint_cb_cr; to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos; to->violates_lfnst_constrained_luma = cu->violates_lfnst_constrained_luma; to->violates_lfnst_constrained_chroma = cu->violates_lfnst_constrained_chroma; @@ -274,23 +299,42 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } - -static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu) +static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) { - const uint32_t x_mask = ~((MIN(width, TR_MAX_WIDTH))-1); - const uint32_t y_mask = ~((MIN(height, TR_MAX_WIDTH))-1); + // The bottom right cu will always have the chroma info + cu_info_t *bottom_right = LCU_GET_CU_AT_PX( + lcu, + cu_loc->local_x + cu_loc->width - 1, + cu_loc->local_y + cu_loc->height - 1); + if(bottom_right->type != CU_INTRA) return; + + for(int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += 4 ) { + for (int x = 
cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += 4) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); + cu->intra.mode_chroma = bottom_right->intra.mode_chroma; + cu->joint_cb_cr = bottom_right->joint_cb_cr; + cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + } + } +} + + + +static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu, enum + uvg_tree_type tree_type) +{ // Set coeff flags in every CU covered by part_mode in this depth. - for (uint32_t y = y_local; y < y_local + height; y += SCU_WIDTH) { - for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { // Use TU top-left CU to propagate coeff flags - cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & x_mask, y & y_mask); - cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & ~(TR_MAX_WIDTH - 1)), y_local + (y & ~(TR_MAX_WIDTH - 1))); + cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); if (cu_from != cu_to) { // Chroma and luma coeff data is needed for deblocking - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); + if(tree_type != UVG_CHROMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); + if(tree_type != UVG_LUMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); + if (tree_type != UVG_LUMA_T)cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); } } } @@ -1090,7 +1134,6 @@ static double search_cu( if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { - intra_search.pred_cu.joint_cb_cr = 4; uvg_search_cu_intra(state, &intra_search, lcu, tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS @@ -1136,11 +1179,6 @@ static double search_cu( intra_search.pred_cu.intra.mode_chroma = intra_mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr 
|| ctrl->cfg.lfnst) { uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, intra_mode, tree_type, is_separate_tree); - - if (intra_search.pred_cu.joint_cb_cr == 0) { - intra_search.pred_cu.joint_cb_cr = 4; - } - } else if (!intra_search.pred_cu.intra.mip_flag) { intra_search.pred_cu.intra.mode_chroma = intra_mode; @@ -1221,16 +1259,26 @@ static double search_cu( if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; + lcu_fill_chroma_cu_info( + lcu, + chroma_loc); uvg_intra_recon_cu(state, &intra_search, chroma_loc, - cur_cu, lcu, + NULL, lcu, UVG_CHROMA_T, false, true); + lcu_fill_cbf( + lcu, + chroma_loc->local_x, + chroma_loc->local_y, + chroma_loc->width, + chroma_loc->height, + cur_cu, + UVG_CHROMA_T); } else { assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); } - if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; // Set isp split cbfs here const int split_type = intra_search.pred_cu.intra.isp_mode; @@ -1302,7 +1350,7 @@ static double search_cu( } } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu, UVG_BOTH_T); } } @@ -1369,21 +1417,27 @@ static double search_cu( uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; - // Recursively split all the way to max search depth. - if (can_split_cu) { + + // If skip mode was selected for the block, skip further search. + // Skip mode means there's no coefficients in the block, so splitting + // might not give any better results but takes more time to do. 
+ // It is ok to interrupt the search as soon as it is known that + // the split costs at least as much as not splitting. + int cbf = cbf_is_set_any(cur_cu->cbf); + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF)) { lcu_t * split_lcu = MALLOC(lcu_t, 5); enum split_type best_split = 0; double best_split_cost = MAX_DOUBLE; cabac_data_t post_seach_cabac; cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + // Recursively split all the way to max search depth. for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { if (!can_split[split_type] || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1438,32 +1492,24 @@ static double search_cu( state->search_cabac.update = 0; split_cost += split_bits * state->lambda; - // If skip mode was selected for the block, skip further search. - // Skip mode means there's no coefficients in the block, so splitting - // might not give any better results but takes more time to do. - // It is ok to interrupt the search as soon as it is known that - // the split costs at least as much as not splitting. - if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - cu_loc_t new_cu_loc[4]; - uint8_t separate_chroma = 0; - const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); - initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? 
chroma_loc : cu_loc , tree_type); - for (int split = 0; split < splits; ++split) { - new_split.part_index = split; - split_cost += search_cu(state, - &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], - &split_lcu[split_type -1], - tree_type, new_split, - !separate_chroma || (split == splits - 1 && has_chroma)); - // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma - if (split_cost > cost || split_cost > best_split_cost) { - break; - } + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? chroma_loc : cu_loc , tree_type); + for (int split = 0; split < splits; ++split) { + new_split.part_index = split; + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], + &split_lcu[split_type -1], + tree_type, new_split, + !separate_chroma || (split == splits - 1 && has_chroma)); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + if (split_cost > cost || split_cost > best_split_cost) { + break; } - - } else { - split_cost = INT_MAX; } + + if (split_cost < best_split_cost) { best_split_cost = split_cost; best_split = split_type; @@ -1492,9 +1538,10 @@ static double search_cu( memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + bool is_implicit = false; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, NULL, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &is_implicit, &bits); cur_cu->intra = cu_d1->intra; diff --git a/src/search_intra.c b/src/search_intra.c index 557dff4e..30110927 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -338,7 +338,6 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; - pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params @@ -489,7 +488,6 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu( state, @@ -544,7 +542,6 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, search_data, cu_loc, pred_cu, lcu, @@ -1623,7 +1620,7 @@ int8_t uvg_search_cu_intra_chroma( chroma_data[i].pred_cu = *cur_pu; chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? luma_mode : modes[i]; chroma_data[i].cost = 0; - if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) { + if(!is_separate && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); } } diff --git a/src/transform.c b/src/transform.c index 4d953454..34e246ce 100644 --- a/src/transform.c +++ b/src/transform.c @@ -863,7 +863,7 @@ void uvg_fwd_lfnst( const uint32_t log2_width = uvg_g_convert_to_log2[width]; const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? 
cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; // This check is safe for 8x16 cus split with TT, since it is checking the dimensions of the // last luma CU which will be 8x4, i.e., 3 + 2 < 6 bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; @@ -1005,7 +1005,7 @@ void uvg_inv_lfnst( const uint32_t log2_width = uvg_g_convert_to_log2[width]; const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] From 992182dafb0d7d7fb2a27c374ed5d6ffb38091ce Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 8 Dec 2022 14:52:01 +0200 Subject: [PATCH 137/254] WIP --- src/encode_coding_tree.c | 3 ++- src/search.c | 5 +++++ src/search_intra.c | 7 +++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index c3400524..7a7395e1 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1415,6 +1415,7 @@ void uvg_encode_coding_tree( cu_loc_t chroma_tree_loc; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; for (int split = 0; split frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - //fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); + fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = 
&cabac->ctx.cu_transquant_bypass; diff --git a/src/search.c b/src/search.c index 5f141d4d..d18a416b 100644 --- a/src/search.c +++ b/src/search.c @@ -315,6 +315,8 @@ static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) cu->intra.mode_chroma = bottom_right->intra.mode_chroma; cu->joint_cb_cr = bottom_right->joint_cb_cr; cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + cu->type = bottom_right->type; + cu->tr_skip |= bottom_right->tr_skip & 6; } } } @@ -1015,6 +1017,7 @@ static double search_cu( const int y = cu_loc->y; const int luma_width = cu_loc->width; const int luma_height = cu_loc->height; + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; @@ -1381,6 +1384,7 @@ static double search_cu( cost = bits * state->lambda; cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d %d %f\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree, cost); //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { // cost = inter_zero_coeff_cost; @@ -1495,6 +1499,7 @@ static double search_cu( cu_loc_t new_cu_loc[4]; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? 
chroma_loc : cu_loc , tree_type); for (int split = 0; split < splits; ++split) { new_split.part_index = split; diff --git a/src/search_intra.c b/src/search_intra.c index 30110927..f497ea4e 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -338,6 +338,7 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; + pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params @@ -488,6 +489,7 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; + pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu( state, @@ -542,6 +544,7 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; + pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, search_data, cu_loc, pred_cu, lcu, @@ -561,7 +564,7 @@ static double search_intra_trdepth( // Early stop condition for the recursive search. // If the cost of any 1/4th of the transform is already larger than the // whole transform, assume that splitting further is a bad idea. 
- if (nosplit_cost >= cost_treshold) { + if (nosplit_cost <= cost_treshold) { return nosplit_cost; } } @@ -1445,7 +1448,7 @@ int8_t uvg_search_intra_chroma_rdo( const int offset = ((cu_loc->local_x) >> 1) + ((cu_loc->local_y) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { + if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && PU_IS_TU(&chroma_data->pred_cu) ) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } From 09baddef17e69552f25d251cf1d3cc7ecf8cd526 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 12 Dec 2022 10:05:17 +0200 Subject: [PATCH 138/254] [mtt] Fix lfnst and chroma coeffs and tests --- src/encode_coding_tree.c | 2 +- src/search.c | 15 +++++++++++---- tests/check_cabac_state_consistency.py | 4 ++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 7a7395e1..3be715d4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1685,7 +1685,7 @@ void uvg_encode_coding_tree( exit(1); } if (state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %d %d", x << (tree_type == UVG_CHROMA_T), y << (tree_type == UVG_CHROMA_T), depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %9d %d", x << (tree_type == UVG_CHROMA_T), y << (tree_type == UVG_CHROMA_T), split_tree.split_tree, tree_type); fwrite(&cabac->ctx, 1, sizeof(cabac->ctx), state->encoder_control->cabac_debug_file); } diff --git a/src/search.c b/src/search.c index d18a416b..74b3760f 100644 --- a/src/search.c +++ b/src/search.c @@ -690,8 +690,12 @@ static double cu_rd_cost_tr_split_accurate( cu_loc_t split_cu_loc[4]; const int split_count= uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + cu_loc_t split_chroma_cu_loc[4]; + if (chroma_loc) { + uvg_get_split_locs(chroma_loc, split, split_chroma_cu_loc, 
NULL); + } for (int i = 0; i < split_count; ++i) { - sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], &split_cu_loc[i], has_chroma); + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], chroma_loc ? &split_chroma_cu_loc[i] : NULL, has_chroma); } return sum + tr_tree_bits * state->lambda; } @@ -751,7 +755,10 @@ static double cu_rd_cost_tr_split_accurate( } // Chroma transform skip enable/disable is non-normative, so we need to count the chroma // tr-skip bits even when we are never using it. - const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && !is_isp; + const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !is_isp; if(cb_flag_y){ if (can_use_tr_skip) { @@ -860,7 +867,7 @@ static double cu_rd_cost_tr_split_accurate( } const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; - if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? cu_loc : chroma_loc, lcu)) { + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? chroma_loc : cu_loc, lcu)) { const int lfnst_idx = is_chroma_tree ? 
tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -1413,7 +1420,7 @@ static double search_cu( split_tree.current_depth < pu_depth_inter.max); if(state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, split_tree.current_depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } diff --git a/tests/check_cabac_state_consistency.py b/tests/check_cabac_state_consistency.py index 4d7f970c..73a1dd72 100644 --- a/tests/check_cabac_state_consistency.py +++ b/tests/check_cabac_state_consistency.py @@ -30,7 +30,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int with open(state_file, "rb") as file: try: while True: - type_, x, y, depth, tree_type = file.read(15).decode().split() + type_, x, y, depth, tree_type = file.read(23).decode().split() # Reset stored data at the beginning of the frame if x == '0' and y == '0' and type_ == "S" and tree_type != "2": if not was_zero_last: @@ -38,7 +38,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int ctx_store = dict() e_store = set() was_zero_last = True - else: + elif int(x) >= 64 and int(y) >= 64: was_zero_last = False ctx = file.read(ctx_count * ctx_size) From 6620ba8d76bd3eab4d920eee5b812762c869952e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 13 Dec 2022 14:51:38 +0200 Subject: [PATCH 139/254] [mtt] fix deblock --- src/cu.h | 8 ++- src/filter.c | 64 +++++++++++---------- src/filter.h | 4 +- src/search.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 189 insertions(+), 44 deletions(-) diff --git a/src/cu.h b/src/cu.h index 751a483c..a2e2234c 100644 --- a/src/cu.h +++ b/src/cu.h @@ -131,6 +131,9 @@ typedef struct uint8_t log2_width : 3; uint8_t log2_height : 3; + 
uint8_t log2_chroma_width : 3; + uint8_t log2_chroma_height : 3; + uint16_t cbf; uint8_t root_cbf; @@ -150,11 +153,14 @@ typedef struct uint8_t mts_last_scan_pos : 1; uint8_t violates_lfnst_constrained_luma : 1; - uint8_t violates_lfnst_constrained_chroma; + uint8_t violates_lfnst_constrained_chroma : 1; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; + uint8_t luma_deblocking : 2; + uint8_t chroma_deblocking : 2; + union { struct { int8_t mode; diff --git a/src/filter.c b/src/filter.c index 2f0b6a1c..5605006b 100644 --- a/src/filter.c +++ b/src/filter.c @@ -269,6 +269,7 @@ static bool is_tu_boundary( int32_t x, int32_t y, edge_dir dir, + color_t color, enum uvg_tree_type tree_type) { x >>= tree_type == UVG_CHROMA_T; @@ -276,13 +277,13 @@ static bool is_tu_boundary( // if (x & 3 || y & 3) return false; const cu_info_t *const scu = uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y); - const int tu_width = MIN(TR_MAX_WIDTH, 1 << scu->log2_width); - const int tu_height = MIN(TR_MAX_WIDTH, 1 << scu->log2_height); if (dir == EDGE_HOR) { - return (y & (tu_height - 1)) == 0; + return color == COLOR_Y ? scu->luma_deblocking & EDGE_HOR : + scu->chroma_deblocking & EDGE_HOR; } else { - return (x & (tu_width - 1)) == 0; + return color == COLOR_Y ? 
scu->luma_deblocking & EDGE_VER : + scu->chroma_deblocking & EDGE_VER; } } @@ -321,9 +322,9 @@ static bool is_pu_boundary(const encoder_state_t *const state, static bool is_on_8x8_grid(int x, int y, edge_dir dir) { if (dir == EDGE_HOR) { - return (y & 7) == 0 && (x & 2) == 0; + return (y & 7) == 0; } else { - return (x & 7) == 0 && (y & 2) == 0; + return (x & 7) == 0; } } @@ -603,10 +604,10 @@ static INLINE void get_max_filter_length(uint8_t *filt_len_P, uint8_t *filt_len_ bool transform_edge_4x4[2] = { false, false }; bool transform_edge_8x8[2] = { false, false }; - if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, tree_type); - if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, tree_type); - if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, tree_type); - if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, tree_type); + if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, comp, tree_type); + if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, comp, tree_type); + if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, comp, tree_type); + if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, comp, tree_type); if (comp == COLOR_Y) { if (tu_size_P_side <= 4 || tu_size_Q_side <= 4){ @@ -1066,18 +1067,18 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int cu_width = 1 << (cu_q->log2_width - (tree_type != UVG_CHROMA_T)); - const int cu_height = 1 << (cu_q->log2_height - (tree_type != UVG_CHROMA_T)); + const int cu_width = 1 << (cu_q->log2_chroma_width ); + const int cu_height = 1 << (cu_q->log2_chroma_height); const 
int pu_size = dir == EDGE_HOR ? cu_height : cu_width; const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; const int tu_size_p_side = dir == EDGE_HOR ? - MIN(1 << (cu_p->log2_height - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH) : - MIN(1 << (cu_p->log2_width - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH); + MIN(1 << (cu_p->log2_chroma_height), TR_MAX_WIDTH) : + MIN(1 << (cu_p->log2_chroma_width), TR_MAX_WIDTH); const int tu_size_q_side = dir == EDGE_HOR ? - MIN(1 << (cu_q->log2_height - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH) : - MIN(1 << (cu_q->log2_width - (tree_type != UVG_CHROMA_T)), TR_MAX_WIDTH); + MIN(1 << (cu_q->log2_chroma_height ), TR_MAX_WIDTH) : + MIN(1 << (cu_q->log2_chroma_width ), TR_MAX_WIDTH); get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, tu_size_p_side, tu_size_q_side, @@ -1216,11 +1217,12 @@ static void filter_deblock_unit( // Chroma pixel coordinates. const int32_t x_c = x >> 1; const int32_t y_c = y >> 1; - if (state->encoder_control->chroma_format != UVG_CSP_400 && - (is_on_8x8_grid(x_c, y_c, dir && (x_c + 4) % 32) - || (x == state->tile->frame->width - 8 && dir == 1 && y_c % 8 == 0)) + if (state->encoder_control->chroma_format != UVG_CSP_400 && + is_tu_boundary(state, x, y, dir, COLOR_UV, tree_type) + && (is_on_8x8_grid(x_c, y_c, dir == EDGE_HOR && (x_c + 4) % 32 ? 
EDGE_HOR : EDGE_VER) + || (x == state->tile->frame->width - 8 && dir == EDGE_HOR && y_c % 8 == 0)) && tree_type != UVG_LUMA_T) { - filter_deblock_edge_chroma(state, x_c, y_c, length, dir, tu_boundary, tree_type); + filter_deblock_edge_chroma(state, x_c, y_c, 2, dir, tu_boundary, tree_type); } } @@ -1250,11 +1252,11 @@ static void filter_deblock_lcu_inside(encoder_state_t * const state, for (int edge_y = y; edge_y < end_y; edge_y += 4) { for (int edge_x = x; edge_x < end_x; edge_x += 4) { - bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, luma_tree); + bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, COLOR_Y, luma_tree); if (tu_boundary || is_pu_boundary(state, edge_x, edge_y, dir)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, luma_tree); } - if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, chroma_tree)) { + if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, COLOR_UV, chroma_tree)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, chroma_tree); } } @@ -1281,7 +1283,7 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, for (int x = x_px - 8; x < x_px; x += 4) { for (int y = y_px; y < end; y += 4) { // The top edge of the whole frame is not filtered. 
- bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, luma_tree); + bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, COLOR_Y, luma_tree); if (y > 0 && (tu_boundary || is_pu_boundary(state, x, y, EDGE_HOR))) { filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR, tu_boundary); } @@ -1292,13 +1294,15 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, if (state->encoder_control->chroma_format != UVG_CSP_400) { const int x_px_c = x_px >> 1; const int y_px_c = y_px >> 1; - const int x_c = x_px_c - 4; - const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); - for (int y_c = y_px_c; y_c < end_c; y_c += 8) { - // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, chroma_tree); - if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { - filter_deblock_edge_chroma(state, x_c , y_c, 4, EDGE_HOR, tu_boundary, chroma_tree); + int x_c = x_px_c - 4; + const int end_c_y = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); + for(; x_c < x_px_c; x_c += 2) { + for (int y_c = y_px_c; y_c < end_c_y; y_c += 8) { + // The top edge of the whole frame is not filtered. + bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, COLOR_UV, chroma_tree); + if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { + filter_deblock_edge_chroma(state, x_c , y_c, 2, EDGE_HOR, tu_boundary, chroma_tree); + } } } } diff --git a/src/filter.h b/src/filter.h index 0d98eedd..2db9c871 100644 --- a/src/filter.h +++ b/src/filter.h @@ -46,8 +46,8 @@ * \brief Edge direction. 
*/ typedef enum edge_dir { - EDGE_VER = 0, // vertical - EDGE_HOR = 1, // horizontal + EDGE_VER = 1, // vertical + EDGE_HOR = 2, // horizontal } edge_dir; diff --git a/src/search.c b/src/search.c index 74b3760f..18c721c6 100644 --- a/src/search.c +++ b/src/search.c @@ -39,6 +39,7 @@ #include "cu.h" #include "encoder.h" #include "encode_coding_tree.h" +#include "filter.h" #include "imagelist.h" #include "inter.h" #include "intra.h" @@ -253,6 +254,10 @@ static void work_tree_copy_up( to_cu->intra.mode_chroma = from_cu->intra.mode_chroma; to_cu->joint_cb_cr = from_cu->joint_cb_cr; to_cu->cr_lfnst_idx = from_cu->cr_lfnst_idx; + to_cu->chroma_deblocking = from_cu->chroma_deblocking; + to_cu->log2_chroma_width = from_cu->log2_chroma_width; + to_cu->log2_chroma_height = from_cu->log2_chroma_height; + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_U); cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_V); } @@ -282,6 +287,9 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in to->log2_height = cu->log2_height; to->log2_width = cu->log2_width; + to->log2_chroma_height = cu->log2_chroma_height; + to->log2_chroma_width = cu->log2_chroma_width; + if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; to->intra.mode_chroma = cu->intra.mode_chroma; @@ -315,14 +323,37 @@ static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) cu->intra.mode_chroma = bottom_right->intra.mode_chroma; cu->joint_cb_cr = bottom_right->joint_cb_cr; cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + cu->log2_chroma_height = bottom_right->log2_chroma_height; + cu->log2_chroma_width = bottom_right->log2_chroma_width; cu->type = bottom_right->type; cu->tr_skip |= bottom_right->tr_skip & 6; } } } - +static void lcu_fill_chroma_cbfs(lcu_t *lcu, const cu_loc_t * const chroma_loc, enum uvg_tree_type tree_type) +{ + int8_t height = tree_type == UVG_CHROMA_T ? chroma_loc->chroma_height : chroma_loc->height; + int8_t width = tree_type == UVG_CHROMA_T ? 
chroma_loc->chroma_width : chroma_loc->width; + uint32_t x_local = chroma_loc->local_x; + uint32_t y_local = chroma_loc->local_y; + const int offset = ~((TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) - 1); + // Set coeff flags in every CU covered by part_mode in this depth. + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { + // Use TU top-left CU to propagate coeff flags + cu_info_t* cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & offset), y_local + (y & offset)); + cu_info_t* cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); + if (cu_from != cu_to) { + cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); + cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); + } + } + } + +} + static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu, enum uvg_tree_type tree_type) { @@ -996,6 +1027,97 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) } +static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const chroma_loc, lcu_t* lcu, enum uvg_tree_type tree_type, bool has_chroma, const bool is_separate_tree, int x_local, int y_local) +{ + if(tree_type != UVG_CHROMA_T) { + if(cu_loc->x) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += TR_MAX_WIDTH) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_VER; + if(!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->luma_deblocking |= EDGE_VER; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(cu_loc->y) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + 
cu_loc->height; y += TR_MAX_WIDTH) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->luma_deblocking |= EDGE_VER; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_VER; + } + } + + if(is_separate_tree && has_chroma) { + if (chroma_loc->x) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if (chroma_loc->y) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_VER; + } + } + } + } + else { + + if (chroma_loc->x) { + for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += TR_MAX_WIDTH / 2) { + for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else 
if(chroma_loc->width == 64) { + for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH / 2, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(chroma_loc->y) { + for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += TR_MAX_WIDTH / 2) { + for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (chroma_loc->height == 64) { + for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH / 2)->chroma_deblocking |= EDGE_VER; + } + } + } +} + /** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -1090,6 +1212,11 @@ static double search_cu( cur_cu->log2_width = uvg_g_convert_to_log2[cu_width]; cur_cu->log2_height = uvg_g_convert_to_log2[cu_height]; + if(chroma_loc) { + cur_cu->log2_chroma_height = uvg_g_convert_to_log2[chroma_loc->chroma_height]; + cur_cu->log2_chroma_width = uvg_g_convert_to_log2[chroma_loc->chroma_width]; + } + // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
if ( x + luma_width <= frame_width && y + luma_height <= frame_height) @@ -1269,23 +1396,21 @@ static double search_cu( if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; - lcu_fill_chroma_cu_info( - lcu, - chroma_loc); + if(tree_type != UVG_CHROMA_T) { + lcu_fill_chroma_cu_info( + lcu, + chroma_loc); + } uvg_intra_recon_cu(state, &intra_search, chroma_loc, NULL, lcu, UVG_CHROMA_T, false, true); - lcu_fill_cbf( + lcu_fill_chroma_cbfs( lcu, - chroma_loc->local_x, - chroma_loc->local_y, - chroma_loc->width, - chroma_loc->height, - cur_cu, - UVG_CHROMA_T); + chroma_loc, + tree_type); } else { assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); } @@ -1409,6 +1534,16 @@ static double search_cu( // lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); //} cabac->update = 0; + + mark_deblocking( + cu_loc, + chroma_loc, + lcu, + tree_type, + has_chroma, + is_separate_tree, + x_local, + y_local); } bool can_split_cu = From b69e9b29583e44e5af977b1183f5d6260489178a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 08:37:06 +0200 Subject: [PATCH 140/254] [mtt] Fix final issues? 
--- src/encode_coding_tree.c | 2 +- src/global.h | 2 +- src/intra.c | 7 ++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 3be715d4..65efa06e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1439,7 +1439,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); + // fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; diff --git a/src/global.h b/src/global.h index 27058463..972b7e82 100644 --- a/src/global.h +++ b/src/global.h @@ -129,7 +129,7 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -#define UVG_DEBUG_PRINT_CABAC 1 +//#define UVG_DEBUG_PRINT_CABAC 1 //#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 diff --git a/src/intra.c b/src/intra.c index e39878df..4b6d056a 100644 --- a/src/intra.c +++ b/src/intra.c @@ -551,11 +551,8 @@ static void predict_cclm( const uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - // Essentially what this does is that it uses 6-tap filtering to downsample - // the luma intra references down to match the resolution of the chroma channel. - // The luma reference is only needed when we are not on the edge of the picture. - // Because the reference pixels that are needed on the edge of the ctu this code - // is kinda messy but what can you do + tree_type = state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? tree_type : UVG_BOTH_T; + const int ctu_size = tree_type == UVG_CHROMA_T ? 
LCU_WIDTH_C : LCU_WIDTH; if (y0) { From 05218bae212d01708693884cadf676f1ca388447 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 08:47:24 +0200 Subject: [PATCH 141/254] [jccr] jccr=4 hasn't been necessary for a long time --- src/cu.h | 2 +- src/search_intra.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cu.h b/src/cu.h index a2e2234c..bf89e477 100644 --- a/src/cu.h +++ b/src/cu.h @@ -126,7 +126,7 @@ typedef struct uint8_t merge_idx : 3; //!< \brief merge index uint8_t tr_skip : 3; //!< \brief transform skip flag uint8_t tr_idx : 3; //!< \brief transform index - uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding + uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding uint8_t log2_width : 3; uint8_t log2_height : 3; diff --git a/src/search_intra.c b/src/search_intra.c index f497ea4e..0fe33d58 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -338,7 +338,6 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; - pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params @@ -489,7 +488,6 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu( state, @@ -544,7 +542,6 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, search_data, cu_loc, pred_cu, lcu, From f6ecb15cede7c386e47e00c9ce2ae4e8f924c4fc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 09:01:52 +0200 Subject: [PATCH 142/254] 
[mtt] Fix implicit splits when mtt is not enabled --- src/cu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cu.c b/src/cu.c index d4ef2881..56c8135b 100644 --- a/src/cu.c +++ b/src/cu.c @@ -370,8 +370,12 @@ int uvg_get_split_locs( } -int uvg_get_implicit_split(const encoder_state_t* const state, const cu_loc_t* const cu_loc, enum - uvg_tree_type tree_type) +int uvg_get_implicit_split( + const encoder_state_t* const state, + const cu_loc_t* const cu_loc, + enum + uvg_tree_type tree_type, + uint8_t max_mtt_depth) { // This checking if cabac is in update state is a very dirty way of checking // whether we are in the search or writing the bitstream, and unfortunately the @@ -383,8 +387,8 @@ int uvg_get_implicit_split(const encoder_state_t* const state, const cu_loc_t* c bool bottom_ok = (state->tile->frame->height >> (tree_type == UVG_CHROMA_T && state->cabac.update)) >= cu_loc->y + cu_loc->height; if (right_ok && bottom_ok) return NO_SPLIT; - if (right_ok) return BT_HOR_SPLIT; - if (bottom_ok) return BT_VER_SPLIT; + if (right_ok && max_mtt_depth != 0) return BT_HOR_SPLIT; + if (bottom_ok && max_mtt_depth != 0) return BT_VER_SPLIT; return QT_SPLIT; } @@ -394,7 +398,6 @@ int uvg_get_possible_splits(const encoder_state_t * const state, { const int width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; const int height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; - const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, tree_type); const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const unsigned max_btd = @@ -404,6 +407,8 @@ int uvg_get_possible_splits(const encoder_state_t * const state, const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type] >> (tree_type == UVG_CHROMA_T); const unsigned min_tt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; + + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, tree_type, max_btd); splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; From 90ce1390c0b4309aefe6d7be57b684d67f56380c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 09:11:40 +0200 Subject: [PATCH 143/254] [mtt] static --- src/strategies/generic/picture-generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index d6e3c81c..5e06ebbe 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -475,7 +475,7 @@ SATD_DUAL_NXN(64, uvg_pixel) SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) -uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +static uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) { uint64_t satd = 0; coeff_t diff[4], m[4]; @@ -943,7 +943,7 @@ static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int } -uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride) +static uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride) { const uvg_pixel* piOrg = ref_in; const uvg_pixel* piCur = pred_in; From 9c2574880ad56a817ce80344b2c853afd9793293 Mon 
Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 09:25:54 +0200 Subject: [PATCH 144/254] [mtt] Fix deblock for --combine-intra --- src/search.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/search.c b/src/search.c index 18c721c6..4970cbc3 100644 --- a/src/search.c +++ b/src/search.c @@ -1721,6 +1721,8 @@ static double search_cu( cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); + mark_deblocking(cu_loc, chroma_loc, lcu, tree_type, has_chroma, is_separate_tree, x_local, y_local); + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } From 567fa7b2bd02368cd8563d0f656fa0f84294917f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 10:10:08 +0200 Subject: [PATCH 145/254] [deblock] Fix incorrect direction for transform split of tall blocks at the top CTU row --- src/search.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 4970cbc3..e5cafa39 100644 --- a/src/search.c +++ b/src/search.c @@ -1055,8 +1055,8 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const } else if (cu_loc->height == 64) { for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->luma_deblocking |= EDGE_VER; - if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_VER; + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; } } From facbc794bf2085ecc78415f74f22ff20b27ceab3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 10:16:51 +0200 Subject: [PATCH 146/254] [mtt] Fix trying to get split data from depth -1 --- src/cu.c | 2 +- src/cu.h | 2 +- 2 files changed, 
2 insertions(+), 2 deletions(-) diff --git a/src/cu.c b/src/cu.c index 56c8135b..2697c38c 100644 --- a/src/cu.c +++ b/src/cu.c @@ -413,7 +413,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; - const enum split_type last_split = (split_tree.split_tree >> (split_tree.current_depth * 3 - 3)) & 7; + const enum split_type last_split = GET_SPLITDATA(&split_tree, 0); const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT; // don't allow QT-splitting below a BT split diff --git a/src/cu.h b/src/cu.h index bf89e477..e3668f19 100644 --- a/src/cu.h +++ b/src/cu.h @@ -614,7 +614,7 @@ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) *cbf |= src & (1 << plane); } -#define GET_SPLITDATA(CU,curDepth) (((CU)->split_tree >> ((curDepth) * 3)) & 7) +#define GET_SPLITDATA(CU,curDepth) ((CU)->split_tree >> ((MAX((curDepth), 0) * 3)) & 7) #define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE) #endif From 23e6b9f56c5c912aab9fd0ce8f1cc58f138c66db Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 10:23:20 +0200 Subject: [PATCH 147/254] [mtt] Check that we are inside the CTU before checking the ctu data --- src/cu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cu.c b/src/cu.c index 2697c38c..df89642b 100644 --- a/src/cu.c +++ b/src/cu.c @@ -499,12 +499,12 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons int amount = 0; if(left) { if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu_loc->height == 32 && cu_loc->width == 32) return 8; - while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET && (cu_loc->local_y + amount) < LCU_WIDTH) { + while 
(cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) { amount += TR_MIN_WIDTH; } return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH); } - while (LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET && cu_loc->local_x + amount < LCU_WIDTH) { + while (cu_loc->local_x + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { amount += TR_MIN_WIDTH; } return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); @@ -520,12 +520,12 @@ int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height int amount = 0; if(left) { - while (LCU_GET_CU_AT_PX(lcu, local_x - TR_MIN_WIDTH, local_y + amount)->type != CU_NOTSET && (local_y + amount) < LCU_WIDTH_C) { + while (local_y + amount < LCU_WIDTH_C && LCU_GET_CU_AT_PX(lcu, local_x - TR_MIN_WIDTH, local_y + amount)->type != CU_NOTSET) { amount += TR_MIN_WIDTH; } return MAX(amount / TR_MIN_WIDTH, height / TR_MIN_WIDTH); } - while (LCU_GET_CU_AT_PX(lcu, local_x + amount, local_y - TR_MIN_WIDTH)->type != CU_NOTSET && local_x + amount < LCU_WIDTH_C) { + while (local_x + amount < LCU_WIDTH_C && LCU_GET_CU_AT_PX(lcu, local_x + amount, local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { amount += TR_MIN_WIDTH; } return MAX(amount / TR_MIN_WIDTH, width / TR_MIN_WIDTH); From 71516b815525ea6d682df65020948244c5ce0c93 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 10:41:36 +0200 Subject: [PATCH 148/254] [mtt] Make sure mtt splits cannot reach a situation where search cannot be performed --- src/search.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index e5cafa39..b8040666 100644 --- a/src/search.c +++ b/src/search.c @@ -1560,7 +1560,24 @@ static double search_cu( } bool can_split[6]; - uvg_get_possible_splits(state, cu_loc, 
split_tree, tree_type, can_split); + bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; + const int max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; + int minimum_split_amount; + switch (slice_type) { + case 0: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + case 1: minimum_split_amount = MIN(pu_depth_intra.min, pu_depth_inter.min) - split_tree.current_depth; break; + case 2: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + default: + assert(0 && "Incorrect_slice_type"); + } + if(minimum_split_amount > max_btd && !is_implicit) { + // If search should not be performed at depths that cannot be reached after a maximum mtt split amount + // we are in trouble, therefore prevent mtt splits in such situation + can_split[2] = can_split[3] = can_split[4] = can_split[5] = false; + } + can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; @@ -1588,7 +1605,6 @@ static double search_cu( double split_bits = 0; - bool is_implicit = false; if (cur_cu->log2_height + cur_cu->log2_width > 4) { From 06fa86c34000ff657d6d43864403102e06d11093 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 10:54:52 +0200 Subject: [PATCH 149/254] [isp] Fix coordinates --- src/encode_coding_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 65efa06e..a2ecf5a9 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -141,8 +141,8 @@ bool uvg_is_lfnst_allowed( for (int i = 0; i < split_num; ++i) { cu_loc_t split_loc; uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_width, cu_height, i, isp_mode, false); - int local_split_x = split_loc.x; - int local_split_y = split_loc.y; + int local_split_x = lcu ? 
split_loc.local_x : split_loc.x; + int local_split_y = lcu ? split_loc.local_y : split_loc.y; uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); const cu_info_t* split_cu = lcu ? LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); From 5aa13ad62a7242c9875f6ee4215aa18cf63c7b1b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 11:26:01 +0200 Subject: [PATCH 150/254] [tests] add tests for mtt --- CMakeLists.txt | 3 +++ tests/test_cabac_state.sh | 4 ++-- tests/test_mtt.sh | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 tests/test_mtt.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index ab0b63a6..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -340,6 +340,9 @@ if(NOT DEFINED MSVC) if(NOT "test_external_symbols" IN_LIST XFAIL) add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() + if(NOT "test_mtt" IN_LIST XFAIL) + add_test( NAME test_mtt COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mtt.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() if(NOT "test_intra" IN_LIST XFAIL) add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() diff --git a/tests/test_cabac_state.sh b/tests/test_cabac_state.sh index e60806dc..865d9018 100755 --- a/tests/test_cabac_state.sh +++ b/tests/test_cabac_state.sh @@ -6,10 +6,10 @@ set -eu cabacfile="$(mktemp)" -valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 
check_cabac_state_consistency.py "${cabacfile}" -valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" rm -rf "${cabacfile}" diff --git a/tests/test_mtt.sh b/tests/test_mtt.sh new file mode 100644 index 00000000..5fc5587b --- /dev/null +++ b/tests/test_mtt.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +# Test all-intra coding. + +set -eu + +. "${0%/*}/util.sh" + +common_args='264x130 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-cpuid --no-wpp --fast-residual-cost 0' +valgrind_test $common_args --rd=0 --mtt-depth-intra 1 --pu-depth-intra 2-3 +valgrind_test $common_args --rd=3 --mtt-depth-intra 1 --pu-depth-intra 0-5 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --rdoq --jccr --isp --lfnst --mip --mrl --mts intra --cclm --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 From 972670a1773028d29a71b47eb618b8d3963c3ab3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 11:35:44 +0200 Subject: [PATCH 151/254] [tests] chmod +x --- tests/test_mtt.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/test_mtt.sh diff --git a/tests/test_mtt.sh b/tests/test_mtt.sh old mode 100644 new mode 100755 From 74591cd39bc0c1a64c2ba6984780f13d7cc808fa Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 11:38:30 +0200 Subject: [PATCH 152/254] [tests] no-cpuid for test_cabac_state.sh since it now has mtt in it --- 
tests/test_cabac_state.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cabac_state.sh b/tests/test_cabac_state.sh index 865d9018..6d60d1da 100755 --- a/tests/test_cabac_state.sh +++ b/tests/test_cabac_state.sh @@ -6,10 +6,10 @@ set -eu cabacfile="$(mktemp)" -valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" -valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" rm -rf "${cabacfile}" From ffe17e48d7694a5ef617c3459fc71e5047cadf47 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 12:02:00 +0200 Subject: [PATCH 153/254] [mtt] minor fixes --- src/cu.c | 2 +- src/search.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cu.c b/src/cu.c index df89642b..6b4f483d 100644 --- a/src/cu.c +++ b/src/cu.c @@ -413,7 +413,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; - const enum split_type last_split 
= GET_SPLITDATA(&split_tree, 0); + const enum split_type last_split = GET_SPLITDATA(&split_tree, split_tree.current_depth - 1); const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT; // don't allow QT-splitting below a BT split diff --git a/src/search.c b/src/search.c index b8040666..5a8ef639 100644 --- a/src/search.c +++ b/src/search.c @@ -1572,7 +1572,7 @@ static double search_cu( default: assert(0 && "Incorrect_slice_type"); } - if(minimum_split_amount > max_btd && !is_implicit) { + if(minimum_split_amount > max_btd && !is_implicit && can_split[1]) { // If search should not be performed at depths that cannot be reached after a maximum mtt split amount // we are in trouble, therefore prevent mtt splits in such situation can_split[2] = can_split[3] = can_split[4] = can_split[5] = false; From 1493a2616c88bc48538d38609e8df16c0975e4f1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 Dec 2022 12:33:24 +0200 Subject: [PATCH 154/254] [mtt] fix getting collocated chroma for edge cus --- src/search.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/search.c b/src/search.c index 5a8ef639..b316107b 100644 --- a/src/search.c +++ b/src/search.c @@ -166,6 +166,14 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } + + for(int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + if(x >= cu_loc->local_x && y>= cu_loc->local_y) continue; + *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); + } + } + if (chroma_loc->local_x == 0) { to->left_ref = from->left_ref; *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); From 1333ab55d94ab32f03052879f9c2eb612240ed63 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 14 
Dec 2022 13:12:03 +0200 Subject: [PATCH 155/254] [mtt] Fix ref building for 32x64 cus --- src/cu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cu.c b/src/cu.c index 6b4f483d..6ed2f1b7 100644 --- a/src/cu.c +++ b/src/cu.c @@ -498,7 +498,8 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons int amount = 0; if(left) { - if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu_loc->height == 32 && cu_loc->width == 32) return 8; + const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); + if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8; while (cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) { amount += TR_MIN_WIDTH; } From 34aed10ec1f949d7c78f7fc9213050ab42b67438 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Dec 2022 09:09:38 +0200 Subject: [PATCH 156/254] [mtt] fix --- src/search.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search.c b/src/search.c index b316107b..747182d7 100644 --- a/src/search.c +++ b/src/search.c @@ -166,6 +166,9 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } + for (int x = x_start; x < y_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } for(int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { From 8e4b864e6bacd8420118209bdd82575c3796120b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Dec 2022 10:07:47 +0200 Subject: [PATCH 157/254] [deblock] Fix incorrect direction for transform split of tall blocks at the top CTU row also for chroma 
--- src/search.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 747182d7..8ebc9280 100644 --- a/src/search.c +++ b/src/search.c @@ -1094,7 +1094,7 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const } else if (cu_loc->height == 64) { for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_VER; + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; } } } @@ -1123,7 +1123,7 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const } else if (chroma_loc->height == 64) { for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH / 2)->chroma_deblocking |= EDGE_VER; + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH / 2)->chroma_deblocking |= EDGE_HOR; } } } From 926ed7e14571a0eea9ed6f93b7ddf303666ef6c3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Dec 2022 11:25:12 +0200 Subject: [PATCH 158/254] [rdoq] partly fix rdoq for 16x1 and 1x16 --- src/rdo.c | 2 +- src/tables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 6122269d..67605c4e 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1461,7 +1461,7 @@ void uvg_rdoq( const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; - const uint32_t num_blk_side = width >> shift; + const uint32_t num_blk_side = MAX(width >> shift, 1); double cost_coeffgroup_sig[ 64 ]; uint32_t sig_coeffgroup_flag[ 64 ]; diff --git a/src/tables.c b/src/tables.c index dec6f020..c98ecf79 100644 --- a/src/tables.c +++ b/src/tables.c @@ -2615,7 +2615,7 @@ const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, in return g_scan_order[scan_group][log2_w][log2_h]; } else { - if (log2_w == 1 || log2_h == 1) { + if (log2_w <= 1 || log2_h <= 1) { // Just return array containing [0, 15] in order 
return g_scan_order[scan_group][0][4]; } From d296cac7c37078549b348c1f52fa6f2cdddc8e7c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Dec 2022 12:41:12 +0200 Subject: [PATCH 159/254] [mtt] fix reference building for 16x1 --- src/intra.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/intra.c b/src/intra.c index 4b6d056a..15beafbf 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1495,8 +1495,7 @@ void uvg_intra_build_reference_inner( if (px.y % 4 != 0) { do { out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; - out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; - i += 2; + i += 1; } while (i < px_available_left); } else { From c89ebf8bf10bf901078db95255d0c728e7a6310b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 15 Dec 2022 13:12:42 +0200 Subject: [PATCH 160/254] [cclm] Fix heap corruption for non 64 divisible frames --- src/search.c | 2 +- src/videoframe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 8ebc9280..dee9bbbe 100644 --- a/src/search.c +++ b/src/search.c @@ -450,7 +450,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, if((y + height * 2) % 64 == 0) { int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; - for (int i = 0; i < width; ++i) { + for (int i = 0; i < width && i + x < stride2 / 2; ++i) { int s = 2; s += y_rec[i * 2] * 2; s += y_rec[i * 2 + 1]; diff --git a/src/videoframe.c b/src/videoframe.c index f5a4d8af..e9a43dc1 100644 --- a/src/videoframe.c +++ b/src/videoframe.c @@ -61,7 +61,7 @@ videoframe_t * uvg_videoframe_alloc(int32_t width, frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu); if (cclm) { assert(chroma_format == UVG_CSP_420); - frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4); + frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + 
FRAME_PADDING_LUMA) * (((height + 15) & ~7) + FRAME_PADDING_LUMA) / 4); frame->cclm_luma_rec_top_line = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64)); } } From ad2bb20f230b8e03f4a5b21841735f7b471b914f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 16 Dec 2022 09:37:56 +0200 Subject: [PATCH 161/254] [mtt] Fix deblock for isp and properly set the limit for cclm --- src/encode_coding_tree.c | 2 +- src/search.c | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a2ecf5a9..94d39f2d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1439,7 +1439,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - // fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); + // fprintf(stderr, "%4d %4d %2d %2d %d %d %d\n", x, y, cu_width, cu_height, has_chroma, tree_type, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; diff --git a/src/search.c b/src/search.c index dee9bbbe..7efdb004 100644 --- a/src/search.c +++ b/src/search.c @@ -450,7 +450,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, if((y + height * 2) % 64 == 0) { int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; - for (int i = 0; i < width && i + x < stride2 / 2; ++i) { + for (int i = 0; i < width && i + x / 2 < stride2 / 2; ++i) { int s = 2; s += y_rec[i * 2] * 2; s += y_rec[i * 2 + 1]; @@ -1555,6 +1555,31 @@ static double search_cu( is_separate_tree, x_local, y_local); + if (cur_cu->type == CU_INTRA && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP && tree_type != UVG_CHROMA_T) { + const int split_num = uvg_get_isp_split_num( cu_width, cu_height, cur_cu->intra.isp_mode,true); + for (int i = 1; i < split_num; i++) { + cu_loc_t isp_loc; + 
uvg_get_isp_split_loc( + &isp_loc, + x, + y, + cu_width, + cu_height, + i, + cur_cu->intra.isp_mode, + true); + if (x % 4 || y % 4) continue; + mark_deblocking( + &isp_loc, + chroma_loc, + lcu, + UVG_LUMA_T, + false, + false, + isp_loc.local_x, + isp_loc.local_y); + } + } } bool can_split_cu = From 446c53fd00acd4aaedabc112f6083855e9baf517 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 16 Dec 2022 14:13:45 +0200 Subject: [PATCH 162/254] [mtt] Fix cclm for non 64 divisible heights --- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index 7efdb004..c0dc0700 100644 --- a/src/search.c +++ b/src/search.c @@ -430,7 +430,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, const int stride = state->tile->frame->rec->stride; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { + for (int y_ = 0; y_ < height && y_ * 2 + y < state->tile->frame->height; y_++) { for (int x_ = 0; x_ < width; x_++) { int s = 4; s += y_rec[2 * x_] * 2; From 4e203108bca022cb9110e4ef3c6058855164e6c3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Dec 2022 09:35:32 +0200 Subject: [PATCH 163/254] [mtt] Fix ref pixel generation for the second half of 32x2 chroma cus --- src/intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 15beafbf..0217ed7c 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1492,7 +1492,7 @@ void uvg_intra_build_reference_inner( int i = multi_ref_index; // Offset by multi_ref_index // Do different loop for heights smaller than 4 (possible for some ISP splits) - if (px.y % 4 != 0) { + if (px.y % 4 != 0 || px_available_left < 4) { do { out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; i += 1; From 27d114bc082358ca0bab69f17f242a9363292fbc Mon Sep 17 00:00:00 2001 From: Joose Sainio 
Date: Mon, 19 Dec 2022 09:59:33 +0200 Subject: [PATCH 164/254] [mtt] Fix negative indexing --- src/strategies/generic/dct-generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index fec783b6..2a673d21 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2599,8 +2599,8 @@ static void mts_dct_generic( } } - partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus1]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus1]; + partial_tr_func* dct_hor = width != 1 ? dct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* dct_ver = height != 1 ? dct_table[type_ver][log2_height_minus1] : NULL; int16_t tmp[32 * 32]; const int32_t shift_1st = log2_width_minus1 + bitdepth - 8; @@ -2655,8 +2655,8 @@ static void mts_idct_generic( } } - partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus1]; - partial_tr_func* idct_ver = idct_table[type_ver][log2_height_minus1]; + partial_tr_func* idct_hor = width != 1 ? idct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* idct_ver = height != 1 ? 
idct_table[type_ver][log2_height_minus1] : NULL; int16_t tmp[32 * 32]; const int max_log2_tr_dynamic_range = 15; From 812377db45d66857d218d3b29a1170305afee0a4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Dec 2022 10:24:00 +0200 Subject: [PATCH 165/254] [mtt] Set cus outside of the frame to zero for initializing partial worktree --- src/search.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index c0dc0700..10b2ae35 100644 --- a/src/search.c +++ b/src/search.c @@ -77,12 +77,17 @@ static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu } -static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu_loc_t * const cu_loc, const cu_loc_t* const - chroma_loc, - const enum uvg_tree_type tree_type) { +static INLINE void initialize_partial_work_tree( + const encoder_state_t* const state, + lcu_t* from, + lcu_t *to, + const cu_loc_t * const cu_loc, + const cu_loc_t* const + chroma_loc, + const enum uvg_tree_type tree_type) { - const int y_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); - const int x_limit = LCU_WIDTH >> (tree_type == UVG_CHROMA_T); + const int y_limit = MIN(LCU_WIDTH, state->tile->frame->height - cu_loc->y / 64 * 64) >> (tree_type == UVG_CHROMA_T); + const int x_limit = MIN(LCU_WIDTH, state->tile->frame->width - cu_loc->x / 64 * 64) >> (tree_type == UVG_CHROMA_T); if (cu_loc->local_x == 0) { to->left_ref = from->left_ref; @@ -186,6 +191,16 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } } + if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 
0, sizeof(cu_info_t)); + } + } } static INLINE void copy_cu_pixels( @@ -1694,7 +1709,7 @@ static double search_cu( uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); separate_chroma |= !has_chroma; - initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? chroma_loc : cu_loc , tree_type); + initialize_partial_work_tree(state, lcu, &split_lcu[split_type - 1], cu_loc , separate_chroma ? chroma_loc : cu_loc, tree_type); for (int split = 0; split < splits; ++split) { new_split.part_index = split; split_cost += search_cu(state, From 9acdab3209f1e49157c51df194e1087e5d95129d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Dec 2022 13:22:10 +0200 Subject: [PATCH 166/254] [mtt] Fix lfnst bit counting for 64 wide or tall chroma tree cus --- src/encode_coding_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 94d39f2d..31bbe003 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -115,8 +115,8 @@ bool uvg_is_lfnst_allowed( { if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && PU_IS_TU(pred_cu)) { const int isp_mode = pred_cu->intra.isp_mode; - const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const int cu_width = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_width : 1 << pred_cu->log2_chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_height : 1 << pred_cu->log2_chroma_height; bool can_use_lfnst_with_mip = (cu_width >= 16 && cu_height >= 16); bool is_sep_tree = tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? 
pred_cu->intra.mip_flag : false; From af23c81afa5c502a45e2af204d98080af96ba6d2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Dec 2022 14:25:03 +0200 Subject: [PATCH 167/254] [mtt] Fix reading uninitialized data for local chroma tree --- src/search.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/search.c b/src/search.c index 10b2ae35..c1e1a153 100644 --- a/src/search.c +++ b/src/search.c @@ -190,15 +190,27 @@ static INLINE void initialize_partial_work_tree( to->top_ref = from->top_ref; *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } - } - if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { - for (int y = y_start; y < y_limit; y += SCU_WIDTH) { - memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } } } - if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { - for (int x = x_start; x < x_limit; x += SCU_WIDTH) { - memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + else { + if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } } } } From a36a1fb5fff8e5c10a812a48470953b2eb308344 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 19 Dec 2022 14:42:45 +0200 Subject: [PATCH 168/254] [mtt] There is always at least the height or width amount reference pixels available --- 
src/cu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cu.c b/src/cu.c index 6ed2f1b7..147875fb 100644 --- a/src/cu.c +++ b/src/cu.c @@ -496,7 +496,7 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4; if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2; - int amount = 0; + int amount = left ? cu_loc->height & ~3 : cu_loc->width & ~3; if(left) { const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8; From f3c8a4f5dbdfb6e743548d97fafdadb5036f18a7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 20 Dec 2022 08:23:01 +0200 Subject: [PATCH 169/254] [lfnst] Also chroma can only use lfnst if dimensions are minimum 4 --- src/search_intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index 0fe33d58..b4442f1a 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1445,7 +1445,7 @@ int8_t uvg_search_intra_chroma_rdo( const int offset = ((cu_loc->local_x) >> 1) + ((cu_loc->local_y) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && PU_IS_TU(&chroma_data->pred_cu) ) { + if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && PU_IS_TU(&chroma_data->pred_cu) && chroma_height >= 4 && chroma_width >= 4) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } From 73956a9a46fd0f349022e3afc927ece36a48dfda Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 20 Dec 2022 09:13:08 +0200 Subject: [PATCH 170/254] [isp] Fix isp bitcost calculation --- src/search.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/search.c b/src/search.c index c1e1a153..36d3334a 100644 --- 
a/src/search.c +++ b/src/search.c @@ -538,8 +538,9 @@ double uvg_cu_rd_cost_luma( return sum + tr_tree_bits * state->lambda; } + const bool is_not_isp = pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP; // Add transform_tree cbf_luma bit cost. - if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + if (is_not_isp) { const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; int is_set = cbf_is_set(pred_cu->cbf, COLOR_Y); if (pred_cu->type == CU_INTRA || @@ -562,8 +563,9 @@ double uvg_cu_rd_cost_luma( // TODO: 8x4 CUs const int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, pred_cu->intra.isp_mode, true); int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; for (int i = 0; i < split_limit; i++) { - if (i != 3 && isp_cbf != 0x8) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { const int flag = (isp_cbf >> i) & 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); luma_ctx = 2 + flag; @@ -583,7 +585,7 @@ double uvg_cu_rd_cost_luma( if (!skip_residual_coding) { int8_t luma_scan_mode = SCAN_DIAG; - if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + if (is_not_isp) { //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; const coeff_t* coeffs = lcu->coeff.y; @@ -795,9 +797,10 @@ static double cu_rd_cost_tr_split_accurate( else { // TODO: 8x4 CUs const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; for (int i = 0; i < split_limit; i++) { - int luma_ctx = 2; - if (i != 3 && isp_cbf != 0x8) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { const int flag = (isp_cbf >> i) & 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); luma_ctx = 2 + flag; @@ -829,7 
+832,7 @@ static double cu_rd_cost_tr_split_accurate( && height <= (1 << state->encoder_control->cfg.trskip_max_size) && !is_isp; - if(cb_flag_y){ + if(cb_flag_y || is_isp){ if (can_use_tr_skip) { CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); } @@ -936,7 +939,7 @@ static double cu_rd_cost_tr_split_accurate( } const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; - if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? chroma_loc : cu_loc, lcu)) { + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? chroma_loc : cu_loc, lcu) && tree_type != UVG_LUMA_T) { const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, @@ -1255,6 +1258,8 @@ static double search_cu( cur_cu->log2_chroma_width = uvg_g_convert_to_log2[chroma_loc->chroma_width]; } + intra_search_data_t intra_search; + // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
if ( x + luma_width <= frame_width && y + luma_height <= frame_height) @@ -1304,7 +1309,6 @@ static double search_cu( (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame_height) && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I); - intra_search_data_t intra_search; intra_search.cost = 0; if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; @@ -1553,7 +1557,7 @@ static double search_cu( cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, intra_search.best_isp_cbfs, cu_loc, chroma_loc, has_chroma); //fprintf(stderr, "%4d %4d %2d %2d %d %d %f\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree, cost); //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { From 3b09c66d25d40527c159237a3bd20d43eb9c9c32 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 20 Dec 2022 11:25:58 +0200 Subject: [PATCH 171/254] [deblock] Use the isp block dimensions instead of cu dimensions fro deblock --- src/filter.c | 37 ++++++++++++++++++++++++++++++++++--- src/search.c | 2 +- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/filter.c b/src/filter.c index 5605006b..3b41bd44 100644 --- a/src/filter.c +++ b/src/filter.c @@ -36,6 +36,7 @@ #include "cu.h" #include "encoder.h" +#include "intra.h" #include "uvg266.h" #include "transform.h" #include "videoframe.h" @@ -834,10 +835,40 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, const int cu_height = 1 << cu_q->log2_height; const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; const int pu_pos = dir == EDGE_HOR ? 
y_coord : x_coord; + int tu_size_q_side = 0; + if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_q_side = MAX(4, cu_height >> 2); + } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_q_side = MAX(4, cu_width >> 2); + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } - - const int tu_size_p_side = dir == EDGE_HOR ? MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); - const int tu_size_q_side = dir == EDGE_HOR ? MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + int tu_size_p_side = 0; + if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2); + } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2); + } else { + tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_p_side = dir == EDGE_HOR ? 
+ MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + + } get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, diff --git a/src/search.c b/src/search.c index 36d3334a..e1e8d971 100644 --- a/src/search.c +++ b/src/search.c @@ -1599,7 +1599,7 @@ static double search_cu( i, cur_cu->intra.isp_mode, true); - if (x % 4 || y % 4) continue; + if (isp_loc.x % 4 || isp_loc.y % 4) continue; mark_deblocking( &isp_loc, chroma_loc, From c744f79117c781260a1f5e91750bd637f6def062 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Dec 2022 09:17:55 +0200 Subject: [PATCH 172/254] [mtt] Fix rdoq for non-square blocks --- src/rdo.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 67605c4e..93296e0f 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -33,6 +33,7 @@ #include "rdo.h" #include +#include #include #include #include @@ -1420,7 +1421,7 @@ void uvg_rdoq( bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); needs_block_size_trafo_scale |= 0; // Non log2 block size - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1) + needs_block_size_trafo_scale; // Represents scaling through forward transform + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; @@ -1428,7 +1429,7 @@ void uvg_rdoq( int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); - int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; + int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift - needs_block_size_trafo_scale; const double lambda = color ? 
state->c_lambda : state->lambda; const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; @@ -1473,7 +1474,14 @@ void uvg_rdoq( int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; - uint32_t cg_num = width * height >> 4; + uint32_t cg_num = lfnst_idx > 0 ? 1 : width * height >> 4; + + double dTransShift = (double)transform_shift + (needs_block_size_trafo_scale ? -0.5 : 0.0); + // Compensate for scaling of bitcount in Lagrange cost function + double scale = CTX_FRAC_ONE_BIT; + // Compensate for scaling through forward transform + scale = scale * pow(2.0, -2.0 * dTransShift); + const double default_error_scale = scale / default_quant_coeff / default_quant_coeff; // Explicitly tell the only possible numbers of elements to be zeroed. // Hope the compiler is able to utilize this information. @@ -1502,14 +1510,14 @@ void uvg_rdoq( //Find last cg and last scanpos const int max_lfnst_pos = ((height == 4 && width == 4) || (height == 8 && width == 8)) ? 7 : 15; - int32_t cg_scanpos; + int32_t cg_scanpos; + uint32_t max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1; for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) { - for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; - - if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; + uint32_t blkpos = scan[scanpos]; int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; int32_t level_double = coef[blkpos]; @@ -1518,7 +1526,7 @@ void uvg_rdoq( double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? 
err_scale[blkpos] : default_error_scale); dest_coeff[blkpos] = max_abs_level; if (max_abs_level > 0) { @@ -1548,21 +1556,21 @@ void uvg_rdoq( uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); FILL(rd_stats, 0); - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) { continue; } uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; - double temp = err_scale[blkpos]; + int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; + double temp = (use_scaling_list ? err_scale[blkpos] : default_error_scale); int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; dest_coeff[blkpos] = max_abs_level; double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? 
err_scale[blkpos] : default_error_scale); block_uncoded_cost += cost_coeff0[ scanpos ]; @@ -1698,7 +1706,7 @@ void uvg_rdoq( cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0); // reset coeffs to 0 in this block - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; uint32_t blkpos = scan[scanpos]; if (dest_coeff[blkpos]){ @@ -1751,7 +1759,7 @@ void uvg_rdoq( base_cost -= cost_coeffgroup_sig[cg_scanpos]; if (sig_coeffgroup_flag[ cg_blkpos ]) { - for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for ( int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) continue; uint32_t blkpos = scan[scanpos]; From eae7d72384f1f6b87eb91e9361ebb5d81f9b6d1d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Dec 2022 11:53:55 +0200 Subject: [PATCH 173/254] [isp] Keep cabac contexts up to date for the different isp tus --- src/search.c | 2 +- src/search_intra.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index e1e8d971..dab56586 100644 --- a/src/search.c +++ b/src/search.c @@ -1313,7 +1313,7 @@ static double search_cu( if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { - uvg_search_cu_intra(state, &intra_search, lcu, tree_type, cu_loc); + uvg_search_cu_intra(state, &intra_search, lcu, is_separate_tree ? 
UVG_LUMA_T : tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting diff --git a/src/search_intra.c b/src/search_intra.c index b4442f1a..c79c5d61 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -298,12 +298,18 @@ static double search_intra_trdepth( double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; + cabac_data_t cabac_data; + memcpy(&cabac_data, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + if (width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH) { const bool mts_enabled = (state->encoder_control->cfg.mts == UVG_MTS_INTRA || state->encoder_control->cfg.mts == UVG_MTS_BOTH) && PU_IS_TU(pred_cu); nosplit_cost = 0.0; + const bool has_been_split = 1 << pred_cu->log2_width != cu_loc->width || + 1 << pred_cu->log2_height != cu_loc->height; cbf_clear(&pred_cu->cbf, COLOR_Y); if (reconstruct_chroma) { @@ -345,9 +351,7 @@ static double search_intra_trdepth( if(pred_cu->intra.mip_flag && (width < 16 || height < 16)) { max_lfnst_idx = 0; } - - const bool is_local_dual_tree = pred_cu->log2_width + pred_cu->log2_height < 6 && tree_type == UVG_BOTH_T; - + int start_idx = 0; int end_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
max_lfnst_idx : 0; @@ -431,6 +435,12 @@ static double search_intra_trdepth( continue; } } + + if (!has_been_split) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); + state->search_cabac.update = 1; + } + double rd_cost = uvg_cu_rd_cost_luma( state, cu_loc, @@ -442,8 +452,7 @@ static double search_intra_trdepth( trafo != MTS_SKIP) { if (!constraints[0] && constraints[1]) { transform_bits += CTX_ENTROPY_FBITS( - &state->search_cabac.ctx.lfnst_idx_model[is_local_dual_tree || - tree_type == UVG_LUMA_T], + &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], lfnst_idx != 0); if (lfnst_idx > 0) { transform_bits += CTX_ENTROPY_FBITS( @@ -593,6 +602,7 @@ static double search_intra_trdepth( split_cost += search_intra_trdepth(state, &split_cu_loc[i], nosplit_cost, search_data, lcu, tree_type); } } + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); if (!PU_IS_TU(pred_cu) || split_cost < nosplit_cost) { return split_cost; From b27eca7c37c2697b569ae451a513f6c5971ffe45 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Dec 2022 13:45:56 +0200 Subject: [PATCH 174/254] [deblock] fix width and height to correct order --- src/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/filter.c b/src/filter.c index 3b41bd44..cabc75e3 100644 --- a/src/filter.c +++ b/src/filter.c @@ -838,9 +838,9 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, int tu_size_q_side = 0; if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) { if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { - tu_size_q_side = MAX(4, cu_height >> 2); - } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { tu_size_q_side = MAX(4, cu_width >> 2); + } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_q_side = MAX(4, cu_height >> 2); } else { tu_size_q_side = dir == EDGE_HOR ? 
MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : @@ -855,9 +855,9 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, int tu_size_p_side = 0; if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) { if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { - tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2); - } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2); + } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2); } else { tu_size_p_side = dir == EDGE_HOR ? MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : From 2d00cab4b988582c8b0e5491618a2b57d88ae215 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Dec 2022 14:33:10 +0200 Subject: [PATCH 175/254] [isp] properly reset cabac context during intra search --- src/search_intra.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/search_intra.c b/src/search_intra.c index c79c5d61..f05aa208 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -571,6 +571,7 @@ static double search_intra_trdepth( // If the cost of any 1/4th of the transform is already larger than the // whole transform, assume that splitting further is a bad idea. 
if (nosplit_cost <= cost_treshold) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); return nosplit_cost; } } From bd3ec75173e30e6a6d600370c8121e423bc2fbc8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 2 Jan 2023 13:47:40 +0200 Subject: [PATCH 176/254] [mtt] search early terminations --- src/search.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index dab56586..5e2c47dd 100644 --- a/src/search.c +++ b/src/search.c @@ -1647,6 +1647,7 @@ static double search_cu( can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; + bool improved[6] = {false}; // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting @@ -1654,7 +1655,7 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. int cbf = cbf_is_set_any(cur_cu->cbf); - if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF)) { + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF || true)) { lcu_t * split_lcu = MALLOC(lcu_t, 5); enum split_type best_split = 0; double best_split_cost = MAX_DOUBLE; @@ -1667,6 +1668,26 @@ static double search_cu( || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; + + // Best no split has no residual and same direction bt didn't improve so don't try tt + if ( + !cbf && ((!improved[BT_VER_SPLIT] && split_type == TT_VER_SPLIT) || + (!improved[BT_HOR_SPLIT] && split_type == TT_HOR_SPLIT))) + continue; + + if (split_type == TT_HOR_SPLIT) { + if 
(LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_local, y_local)->log2_height == cur_cu->log2_height - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_local, y_local + luma_height / 2)->log2_height == cur_cu->log2_height - 1) { + continue; + } + } + if (split_type == TT_VER_SPLIT) { + if (LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local, y_local)->log2_width == cur_cu->log2_width - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local + luma_width / 2, y_local)->log2_width == cur_cu->log2_width - 1) { + continue; + } + } + double split_cost = 0.0; memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1709,6 +1730,8 @@ static double search_cu( ); } + const double factor = state->qp > 30 ? 1.1 : 1.075; + if (split_bits * state->frame->lambda + cost / factor > cost) continue; split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), @@ -1721,6 +1744,8 @@ static double search_cu( state->search_cabac.update = 0; split_cost += split_bits * state->lambda; + bool stop_to_qt = split_type == QT_SPLIT; + cu_loc_t new_cu_loc[4]; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); @@ -1734,17 +1759,26 @@ static double search_cu( tree_type, new_split, !separate_chroma || (split == splits - 1 && has_chroma)); // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + + if (split_type == QT_SPLIT) { + const cu_info_t * const t = LCU_GET_CU_AT_PX(&split_lcu[0], new_cu_loc[split].local_x, new_cu_loc[split].local_y); + stop_to_qt &= t->log2_height == cur_cu->log2_height - 1 && t->log2_width == cur_cu->log2_width; + } + if (split_cost > cost || split_cost > best_split_cost) { + stop_to_qt = false; break; } } + improved[split_type] = cost > split_cost; if (split_cost < best_split_cost) { best_split_cost = split_cost; best_split = split_type; 
memcpy(&best_split_cabac, &state->search_cabac, sizeof(cabac_data_t)); } + if (stop_to_qt) break; } // If no search is not performed for this depth, try just the best mode From 0c63743fc0fde14e70687f14dcb506a21d8b5f1a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 4 Jan 2023 09:34:34 +0200 Subject: [PATCH 177/254] [mtt] Early terminations for all intra --- src/search.c | 73 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/search.c b/src/search.c index 5e2c47dd..5afc6b45 100644 --- a/src/search.c +++ b/src/search.c @@ -1159,6 +1159,38 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const } } +static bool check_for_early_termission(const int cu_width, const int cu_height, cu_info_t* cur_cu, int x_local, int y_local, bool improved[6], int cbf, lcu_t* split_lcu, int split_type) +{ + // Best no split has no residual and same direction bt didn't improve so don't try tt + // 3.11 + if ( + !cbf && ((!improved[BT_VER_SPLIT] && split_type == TT_VER_SPLIT) || + (!improved[BT_HOR_SPLIT] && split_type == TT_HOR_SPLIT))) + return true; + + + // 3.8 + if (split_type == TT_HOR_SPLIT) { + bool can_skip = true; + for (int x_scu = x_local; x_scu < x_local + cu_width; x_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local)->log2_height == cur_cu->log2_height - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local + cu_height / 2)->log2_height == cur_cu->log2_height - 1; + } + if (can_skip) return true; + } + if (split_type == TT_VER_SPLIT) { + bool can_skip = true; + for (int y_scu = y_local; y_scu < y_local + cu_height; y_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local, y_scu)->log2_width == cur_cu->log2_width - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local + cu_width / 2, y_scu)->log2_width == cur_cu->log2_width - 1; + } + if (can_skip) return true; + } + return false; +} + 
/** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -1655,6 +1687,12 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. int cbf = cbf_is_set_any(cur_cu->cbf); + + // 3.13 + if ((cu_height < 32 || cu_width < 32) && cur_cu->type != CU_NOTSET && !cbf && split_tree.mtt_depth > 1 && tree_type != UVG_CHROMA_T) { + can_split_cu = false; + } + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF || true)) { lcu_t * split_lcu = MALLOC(lcu_t, 5); enum split_type best_split = 0; @@ -1669,24 +1707,16 @@ static double search_cu( || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; - // Best no split has no residual and same direction bt didn't improve so don't try tt - if ( - !cbf && ((!improved[BT_VER_SPLIT] && split_type == TT_VER_SPLIT) || - (!improved[BT_HOR_SPLIT] && split_type == TT_HOR_SPLIT))) - continue; - - if (split_type == TT_HOR_SPLIT) { - if (LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_local, y_local)->log2_height == cur_cu->log2_height - 1 && - LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_local, y_local + luma_height / 2)->log2_height == cur_cu->log2_height - 1) { - continue; - } - } - if (split_type == TT_VER_SPLIT) { - if (LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local, y_local)->log2_width == cur_cu->log2_width - 1 && - LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local + luma_width / 2, y_local)->log2_width == cur_cu->log2_width - 1) { - continue; - } - } + if (check_for_early_termission( + cu_width, + cu_height, + cur_cu, + x_local, + y_local, + improved, + cbf, + split_lcu, + split_type)) continue; double split_cost = 0.0; memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1730,6 
+1760,7 @@ static double search_cu( ); } + // 3.9 const double factor = state->qp > 30 ? 1.1 : 1.075; if (split_bits * state->frame->lambda + cost / factor > cost) continue; @@ -1744,7 +1775,8 @@ static double search_cu( state->search_cabac.update = 0; split_cost += split_bits * state->lambda; - bool stop_to_qt = split_type == QT_SPLIT; + // 3.7 + bool stop_to_qt = false; cu_loc_t new_cu_loc[4]; uint8_t separate_chroma = 0; @@ -1762,11 +1794,10 @@ static double search_cu( if (split_type == QT_SPLIT) { const cu_info_t * const t = LCU_GET_CU_AT_PX(&split_lcu[0], new_cu_loc[split].local_x, new_cu_loc[split].local_y); - stop_to_qt &= t->log2_height == cur_cu->log2_height - 1 && t->log2_width == cur_cu->log2_width; + stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; } if (split_cost > cost || split_cost > best_split_cost) { - stop_to_qt = false; break; } } From d3f42949a75e862e528bd6ce59ef0deb0cdae3bf Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 4 Jan 2023 09:55:22 +0200 Subject: [PATCH 178/254] [mtt] Only consider termination if the cu is completely inside the frame --- src/search.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/search.c b/src/search.c index 5afc6b45..08d046f1 100644 --- a/src/search.c +++ b/src/search.c @@ -1159,7 +1159,12 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const } } -static bool check_for_early_termission(const int cu_width, const int cu_height, cu_info_t* cur_cu, int x_local, int y_local, bool improved[6], int cbf, lcu_t* split_lcu, int split_type) +static bool check_for_early_termission(const int cu_width, const int cu_height, const cu_info_t* const cur_cu, int x_local, int y_local, const + bool* improved, + int cbf, + lcu_t* split_lcu, + int split_type, + const bool* can_split) { // Best no split has no residual and same direction bt didn't improve so don't try tt // 3.11 @@ -1170,7 +1175,7 @@ static bool 
check_for_early_termission(const int cu_width, const int cu_height, // 3.8 - if (split_type == TT_HOR_SPLIT) { + if (split_type == TT_HOR_SPLIT && can_split[BT_HOR_SPLIT]) { bool can_skip = true; for (int x_scu = x_local; x_scu < x_local + cu_width; x_scu += 4) { can_skip &= @@ -1179,7 +1184,7 @@ static bool check_for_early_termission(const int cu_width, const int cu_height, } if (can_skip) return true; } - if (split_type == TT_VER_SPLIT) { + if (split_type == TT_VER_SPLIT && can_split[BT_VER_SPLIT]) { bool can_skip = true; for (int y_scu = y_local; y_scu < y_local + cu_height; y_scu += 4) { can_skip &= @@ -1292,9 +1297,10 @@ static double search_cu( intra_search_data_t intra_search; + const bool completely_inside = x + luma_width <= frame_width && y + luma_height <= frame_height; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. - if ( x + luma_width <= frame_width && y + luma_height <= frame_height) + if ( completely_inside) { int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = @@ -1707,16 +1713,20 @@ static double search_cu( || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; - if (check_for_early_termission( - cu_width, - cu_height, - cur_cu, - x_local, - y_local, - improved, - cbf, - split_lcu, - split_type)) continue; + if (completely_inside && check_for_early_termission( + cu_width, + cu_height, + cur_cu, + x_local, + y_local, + improved, + cbf, + split_lcu, + split_type, + can_split)) { + can_split[split_type] = false; + continue; + } double split_cost = 0.0; memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1762,7 +1772,10 @@ static double search_cu( // 3.9 const double factor = state->qp > 30 ? 
1.1 : 1.075; - if (split_bits * state->frame->lambda + cost / factor > cost) continue; + if (split_bits * state->frame->lambda + cost / factor > cost) { + can_split[split_type] = false; + continue; + } split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), @@ -1792,12 +1805,13 @@ static double search_cu( !separate_chroma || (split == splits - 1 && has_chroma)); // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma - if (split_type == QT_SPLIT) { + if (split_type == QT_SPLIT && completely_inside) { const cu_info_t * const t = LCU_GET_CU_AT_PX(&split_lcu[0], new_cu_loc[split].local_x, new_cu_loc[split].local_y); stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; } if (split_cost > cost || split_cost > best_split_cost) { + can_split[split_type] = false; break; } } From 1373a7ac1dd098e08826653bbef67d2c5e8af550 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 4 Jan 2023 11:19:01 +0200 Subject: [PATCH 179/254] [mtt] correct indexing for chroma tree --- src/search.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index 08d046f1..b8bb7a63 100644 --- a/src/search.c +++ b/src/search.c @@ -1806,7 +1806,10 @@ static double search_cu( // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma if (split_type == QT_SPLIT && completely_inside) { - const cu_info_t * const t = LCU_GET_CU_AT_PX(&split_lcu[0], new_cu_loc[split].local_x, new_cu_loc[split].local_y); + const cu_info_t * const t = LCU_GET_CU_AT_PX( + &split_lcu[0], + new_cu_loc[split].local_x >> (tree_type == UVG_CHROMA_T), + new_cu_loc[split].local_y >> (tree_type == UVG_CHROMA_T)); stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; } From 2a33af283ed6e6acfe0bb5e52830b57c05214771 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 5 Jan 2023 14:21:43 +0200 
Subject: [PATCH 180/254] [DepQuant] WIP: initialization done --- src/dep_quant.c | 476 +++++++++++++++++++++++++ src/dep_quant.h | 40 +++ src/rdo.c | 1 - src/rdo.h | 2 + src/strategies/generic/quant-generic.c | 1 - src/strategies/generic/quant-generic.h | 2 - 6 files changed, 518 insertions(+), 4 deletions(-) create mode 100644 src/dep_quant.c create mode 100644 src/dep_quant.h diff --git a/src/dep_quant.c b/src/dep_quant.c new file mode 100644 index 00000000..47314f48 --- /dev/null +++ b/src/dep_quant.c @@ -0,0 +1,476 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "dep_quant.h" + +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "uvg_math.h" +#include "generic/quant-generic.h" + + +#define sm_numCtxSetsSig 3 +#define sm_numCtxSetsGtx 2 +#define sm_maxNumSigSbbCtx 2 +#define sm_maxNumSigCtx 12 +#define sm_maxNumGtxCtx 21 +#define SCALE_BITS 15 + + +typedef struct { + int m_QShift; + int64_t m_QAdd; + int64_t m_QScale; + coeff_t m_maxQIdx; + coeff_t m_thresLast; + coeff_t m_thresSSbb; + // distortion normalization + int m_DistShift; + int64_t m_DistAdd; + int64_t m_DistStepAdd; + int64_t m_DistOrgFact; +} quant_block; + +typedef struct { + uint8_t num; + uint8_t inPos[5]; +} NbInfoSbb; + +typedef struct { + uint16_t maxDist; + uint16_t num; + uint16_t outPos[5]; +} NbInfoOut; + +typedef struct { + uint8_t* sbbFlags; + uint8_t* levels; +} SbbCtx; + +typedef struct { + const NbInfoOut* m_nbInfo; + uint32_t m_sbbFlagBits[2][2]; + SbbCtx m_allSbbCtx[8]; + SbbCtx* m_currSbbCtx; + SbbCtx* m_prevSbbCtx; + uint8_t m_memory[8 * (TR_MAX_WIDTH * TR_MAX_WIDTH + 1024)]; +} common_context; + + +typedef struct +{ + int32_t m_lastBitsX[TR_MAX_WIDTH]; + int32_t m_lastBitsY[TR_MAX_WIDTH]; + uint32_t m_sigSbbFracBits[sm_maxNumSigSbbCtx][2]; + uint32_t m_sigFracBits[sm_numCtxSetsSig][sm_maxNumSigCtx][2]; + int32_t m_gtxFracBits[sm_maxNumGtxCtx][6]; + +} 
rate_estimator; + + +typedef struct { + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + const uint32_t* m_sigFracBitsArray; + const uint32_t* m_gtxFracBitsArray; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} depquant_state; + + +static void init_quant_block( + const encoder_state_t* state, + quant_block* qp, + const cu_info_t* const cur_tu, + unsigned log2_width, + unsigned log2_height, + color_t color, + const bool needsSqrt2ScaleAdjustment, + const int gValue) +{ + double lambda = state->lambda; + + const int qpDQ = state->qp + 1; + const int qpPer = qpDQ / 6; + const int qpRem = qpDQ - 6 * qpPer; + const int channelBitDepth = state->encoder_control->bitdepth; + const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE; + const int nomTransformShift = MAX_TR_DYNAMIC_RANGE - channelBitDepth - ((log2_width + log2_height) >> 1); + const bool clipTransformShift = (cur_tu->tr_skip >> color) & 1 && false; // extended precision + const int transformShift = + (clipTransformShift ? MAX(0, nomTransformShift) : + nomTransformShift) + + (needsSqrt2ScaleAdjustment ? -1 : 0); + // quant parameters + qp->m_QShift = QUANT_SHIFT - 1 + qpPer + transformShift; + qp->m_QAdd = -((3 << qp->m_QShift) >> 1); + int invShift = IQUANT_SHIFT + 1 - qpPer - transformShift; + qp->m_QScale = uvg_g_quant_scales[needsSqrt2ScaleAdjustment ? 
1 : 0][qpRem]; + const unsigned qIdxBD = MIN( + maxLog2TrDynamicRange + 1, + 8 * sizeof(int) + invShift - IQUANT_SHIFT - 1); + qp->m_maxQIdx = (1 << (qIdxBD - 1)) - 4; + qp->m_thresLast = (coeff_t)(((int64_t)(4) << qp->m_QShift)); + qp->m_thresSSbb = (coeff_t)(((int64_t)(3) << qp->m_QShift)); + // distortion calculation parameters + const int64_t qScale = (gValue == -1) ? qp->m_QScale : gValue; + const int nomDShift = + 15 - + 2 * (nomTransformShift) + + qp->m_QShift + (needsSqrt2ScaleAdjustment ? 1 : 0); + const double qScale2 = (double)(qScale * qScale); + const double nomDistFactor = + (nomDShift < 0 ? + 1.0 / ((double)((int64_t)(1) << (-nomDShift)) * qScale2 * lambda) : + (double)((int64_t)(1) << nomDShift) / (qScale2 * lambda)); + const int64_t pow2dfShift = (int64_t)(nomDistFactor * qScale2) + 1; + assert(pow2dfShift > 0xfffffffll); + const int dfShift = uvg_math_ceil_log2(pow2dfShift); + qp->m_DistShift = 62 + qp->m_QShift - 2 * maxLog2TrDynamicRange - dfShift; + qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; + qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); + qp->m_DistOrgFact = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + 1)) + .5); +} + +static void reset_common_context(common_context* ctx, const rate_estimator * rate_estimator, int numSbb, int num_coeff) +{ + memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); + memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); + const int chunkSize = numSbb + num_coeff; + uint8_t* nextMem = ctx->m_memory; + for (int k = 0; k < 8; k++, nextMem += chunkSize) { + ctx->m_allSbbCtx[k].sbbFlags = nextMem; + ctx->m_allSbbCtx[k].levels = nextMem + numSbb; + } +} + +static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data_t * const ctx, color_t color) +{ + const cabac_ctx_t * base_ctx = color == COLOR_Y ? 
ctx->ctx.sig_coeff_group_model : (ctx->ctx.sig_coeff_group_model + 2); + for (unsigned ctxId = 0; ctxId < sm_maxNumSigSbbCtx; ctxId++) { + rate_estimator->m_sigSbbFracBits[ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigSbbFracBits[ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + unsigned numCtx = (color == COLOR_Y ? 12 : 8); + for (unsigned ctxSetId = 0; ctxSetId < sm_numCtxSetsSig; ctxSetId++) { + base_ctx = color == COLOR_Y ? ctx->ctx.cu_sig_model_luma[ctxSetId] : ctx->ctx.cu_sig_model_chroma[ctxSetId]; + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + rate_estimator->m_sigFracBits[ctxSetId][ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigFracBits[ctxSetId][ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + } + + numCtx = (color == COLOR_Y? 21 : 11); + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + const cabac_ctx_t * par_ctx = color == COLOR_Y ? &ctx->ctx.cu_parity_flag_model_luma[ctxId] : &ctx->ctx.cu_parity_flag_model_chroma[ctxId]; + const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; + const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? 
&ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; + + int32_t* cb = &rate_estimator->m_gtxFracBits[ctxId]; + int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); + int32_t par1 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 1); + cb[0] = 0; + cb[1] = CTX_ENTROPY_BITS(gt1_ctx, 0) + (1 << SCALE_BITS); + cb[2] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[3] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[4] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 1); + cb[5] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 1); + } +} + + + static void xSetLastCoeffOffset( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const cu_loc_t* const cu_loc, + rate_estimator* rate_estimator, + const bool cb_cbf, + const color_t compID) +{ + int32_t cbfDeltaBits = 0; + if (compID == COLOR_Y && cur_tu->type != CU_INTRA /*&& !tu.depth*/) { + cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 1) - (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 0); + } else { + bool prevLumaCbf = false; + bool lastCbfIsInferred = false; + bool useIntraSubPartitions = cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode && compID == COLOR_Y; + if (useIntraSubPartitions) { + bool rootCbfSoFar = false; + bool isLastSubPartition = false; //TODO: isp check + uint32_t nTus = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, cur_tu->intra.isp_mode, true); + if (isLastSubPartition) { + //TransformUnit* tuPointer = tu.cu->firstTU; + //for (int tuIdx = 0; tuIdx < nTus - 1; tuIdx++) { + // rootCbfSoFar |= TU::getCbfAtDepth(*tuPointer, COMPONENT_Y, tu.depth); + // tuPointer = tuPointer->next; + //} + if (!rootCbfSoFar) { + lastCbfIsInferred = true; + } + } + if (!lastCbfIsInferred) { + prevLumaCbf = false; + } + const cabac_ctx_t * const cbf_ctx = 
&state->search_cabac.ctx.qt_cbf_model_luma[2 + prevLumaCbf]; + cbfDeltaBits = lastCbfIsInferred ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } else { + const cabac_ctx_t* cbf_ctx; + switch (compID) { + case COLOR_Y: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[0]; + break; + case COLOR_U: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cb[0]; + break; + case COLOR_V: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cb_cbf]; + break; + } + cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } + + } + +static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; + uint32_t ctxBits[14]; + for (unsigned xy = 0; xy < 2; xy++) { + int32_t bitOffset = (xy ? cbfDeltaBits : 0); + int32_t* lastBits = (xy ? rate_estimator->m_lastBitsY : rate_estimator->m_lastBitsX); + const unsigned size = (xy ? (compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height) : (compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width)); + const unsigned log2Size = uvg_math_ceil_log2(size); + const bool useYCtx = (xy != 0); + const cabac_ctx_t* const ctxSetLast = useYCtx ? + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_y_luma : state->search_cabac.ctx.cu_ctx_last_y_chroma) : + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_x_luma : state->search_cabac.ctx.cu_ctx_last_x_chroma); + const unsigned lastShift = (compID == COLOR_Y ? (log2Size + 1) >> 2 : CLIP(0, 2, size >> 3)); + const unsigned lastOffset = (compID == COLOR_Y ? (prefixCtx[log2Size]) : 0); + uint32_t sumFBits = 0; + unsigned maxCtxId = g_group_idx[MIN(32, size) - 1]; + for (unsigned ctxId = 0; ctxId < maxCtxId; ctxId++) { + ctxBits[ctxId] = sumFBits + + CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 0) + + (ctxId > 3 ? 
((ctxId - 2) >> 1) << SCALE_BITS : 0) + + bitOffset; + sumFBits += CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 1); + } + ctxBits[maxCtxId] = sumFBits + (maxCtxId > 3 ? ((maxCtxId - 2) >> 1) << SCALE_BITS : 0) + bitOffset; + for (unsigned pos = 0; pos < MIN(32, size); pos++) { + lastBits[pos] = ctxBits[g_group_idx[pos]]; + } + } +} + + +static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2], uint32_t gtx_frac_bits[6]) +{ + state->m_rdCost = INT64_MAX; + state->m_numSigSbb = 0; + state->m_remRegBins = 4; // just large enough for last scan pos + state->m_refSbbCtxId = -1; + state->m_sigFracBits[0] = sig_frac_bits[0]; + state->m_sigFracBits[1] = sig_frac_bits[1]; + memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(gtx_frac_bits)); + state->m_goRicePar = 0; + state->m_goRiceZero = 0; +} + +uint8_t uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const cu_loc_t* const cu_loc, + const coeff_t* srcCoeff, + const coeff_t* coeff_out, + const color_t compID, + enum uvg_tree_type tree_type, + const double lambda, + coeff_t* absSum, + const bool enableScalingLists) +{ + const encoder_control_t* const encoder = state->encoder_control; + //===== reset / pre-init ===== + const int baseLevel = 4; + + const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? 
+ cur_tu->lfnst_idx : + cur_tu->cr_lfnst_idx; + + const int numCoeff = width * height; + + memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); + *absSum = 0; + + const bool is_mts = compID == COLOR_Y && cur_tu->tr_idx > MTS_SKIP; + const bool is_ts = cur_tu->tr_skip >> compID & 1; + + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); + + int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; + bool needs_block_size_trafo_scale = is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; + const int32_t *q_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; + const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (is_ts ? 0 : transform_shift ); + const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 
171 : 85) << (q_bits - 9); + + quant_block quant_block; + init_quant_block(state, &quant_block, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + + //===== scaling matrix ==== + //const int qpDQ = cQP.Qp + 1; + //const int qpPer = qpDQ / 6; + //const int qpRem = qpDQ - 6 * qpPer; + + //TCoeff thresTmp = thres; + bool zeroOut = false; + bool zeroOutforThres = false; + int effWidth = width, effHeight = height; + if ( + (is_mts || + (state->encoder_control->cfg.mts && 1 /*sbt not used by block*/ && + height <= 32 && width <= 32)) && + compID == COLOR_Y) { + effHeight = (height == 32) ? 16 : height; + effWidth = (width == 32) ? 16 : width; + zeroOut = (effHeight < height || effWidth < width); + } + zeroOutforThres = zeroOut || (32 < height || 32 < width); + //===== find first test position ===== + int firstTestPos = numCoeff - 1; + if ( + lfnstIdx > 0 && !is_ts && width >= 4 && + height >= 4) { + firstTestPos = + ((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; + } + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + const coeff_t thres = 4 << q_bits; + for (; firstTestPos >= 0; firstTestPos--) { + coeff_t thresTmp = (enableScalingLists) ? 
(thres / (4 * q_coeff[firstTestPos])) :(thres / (4 * default_quant_coeff)); + if (abs(srcCoeff[firstTestPos]) > thresTmp) { + break; + } + } + if (firstTestPos < 0) { + return 0; + } + + //===== real init ===== + rate_estimator rate_estimator; + init_rate_esimator(&rate_estimator, &state->search_cabac, compID); + xSetLastCoeffOffset(state, cur_tu, cu_loc, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); + common_context common_context; + reset_common_context(&common_context, &rate_estimator, (width * height) >> 4, numCoeff); + depquant_state all_state[12]; + depquant_state start_state; + + + for (int k = 0; k < 12; k++) { + depquant_state_init(&all_state[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); + all_state[k].effHeight = MIN(32, effHeight); + all_state[k].effWidth = MIN(32, effWidth); + } + depquant_state_init(&start_state, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); + start_state.effHeight = MIN(32, effHeight); + start_state.effWidth = MIN(32, effWidth); + + //===== populate trellis ===== + for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { + const ScanInfo& scanInfo = tuPars.m_scanInfo[scanIdx]; + if (enableScalingLists) { + m_quant.initQuantBlock( + tu, + compID, + cQP, + lambda, + quantCoeff[scanInfo.rasterPos]); + xDecideAndUpdate( + abs(tCoeff[scanInfo.rasterPos]), + scanInfo, + (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), + quantCoeff[scanInfo.rasterPos], + effectWidth, + effectHeight, + tu.cu->slice->getReverseLastSigCoeffFlag()); + } else { + xDecideAndUpdate( + abs(tCoeff[scanInfo.rasterPos]), + scanInfo, + (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), + default_quant_coeff, + effectWidth, + effectHeight, + tu.cu->slice->getReverseLastSigCoeffFlag()); + } + } + + //===== find best path ===== + Decision decision = {std::numeric_limits::max(), -1, -2}; + int64_t minPathCost = 0; + for (int8_t stateId = 0; stateId < 4; 
stateId++) { + int64_t pathCost = m_trellis[0][stateId].rdCost; + if (pathCost < minPathCost) { + decision.prevId = stateId; + minPathCost = pathCost; + } + } + + //===== backward scanning ===== + int scanIdx = 0; + for (; decision.prevId >= 0; scanIdx++) { + decision = m_trellis[scanIdx][decision.prevId]; + int32_t blkpos = tuPars.m_scanId2BlkPos[scanIdx].idx; + q_coeff[blkpos] = + (tCoeff[blkpos] < 0 ? -decision.absLevel : decision.absLevel); + absSum += decision.absLevel; + } +} diff --git a/src/dep_quant.h b/src/dep_quant.h new file mode 100644 index 00000000..35fec0b5 --- /dev/null +++ b/src/dep_quant.h @@ -0,0 +1,40 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#ifndef DEP_QUANT_H_ +#define DEP_QUANT_H_ + +#include "global.h" + + + +#endif diff --git a/src/rdo.c b/src/rdo.c index 93296e0f..17de36bb 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -53,7 +53,6 @@ #include "strategies/strategies-quant.h" -#define QUANT_SHIFT 14 #define SCAN_SET_SIZE 16 #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 diff --git a/src/rdo.h b/src/rdo.h index 2b557651..9aa2d425 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -44,6 +44,8 @@ #include "global.h" // IWYU pragma: keep #include "search_inter.h" +#define QUANT_SHIFT 14 +#define IQUANT_SHIFT 6 extern const uint32_t uvg_g_go_rice_range[5]; extern const uint32_t uvg_g_go_rice_prefix_len[5]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 8d2a85da..ed30b691 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -44,7 +44,6 @@ #include "fast_coeff_cost.h" #include "reshape.h" -#define QUANT_SHIFT 14 /** * \brief quantize transformed coefficents * diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h index ba1fa130..665e0863 100644 --- a/src/strategies/generic/quant-generic.h +++ b/src/strategies/generic/quant-generic.h @@ -44,8 +44,6 @@ #include "uvg266.h" #include "tables.h" -#define QUANT_SHIFT 14 - int uvg_strategy_register_quant_generic(void* opaque, 
uint8_t bitdepth); void uvg_quant_generic( const encoder_state_t * const state, From 4dbe0cd6c37251d1bfc23b8290e842a87d2c848a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 9 Jan 2023 14:10:10 +0200 Subject: [PATCH 181/254] [DepQuant] WIP: easy part done --- src/dep_quant.c | 630 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 601 insertions(+), 29 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 47314f48..776d482b 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -47,7 +47,22 @@ #define sm_maxNumSigCtx 12 #define sm_maxNumGtxCtx 21 #define SCALE_BITS 15 +#define RICEMAX 32 +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + +enum ScanPosType { 
SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; typedef struct { int m_QShift; @@ -79,6 +94,21 @@ typedef struct { uint8_t* levels; } SbbCtx; + + +typedef struct +{ + coeff_t absLevel; + int64_t deltaDist; +}PQData; + +typedef struct { + int64_t rdCost; + coeff_t absLevel; + int prevId; +} Decision; + + typedef struct { const NbInfoOut* m_nbInfo; uint32_t m_sbbFlagBits[2][2]; @@ -114,7 +144,7 @@ typedef struct { int8_t m_stateId; const uint32_t* m_sigFracBitsArray; const uint32_t* m_gtxFracBitsArray; - common_context* m_commonCtx; + struct common_context* m_commonCtx; unsigned effWidth; unsigned effHeight; @@ -317,12 +347,557 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2] state->m_goRiceZero = 0; } +static INLINE void checkRdCostSkipSbbZeroOut(Decision *decision, const depquant_state * const state) +{ + int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; + decision->rdCost = rdCost; + decision->absLevel = 0; + decision->prevId = 4 + state->m_stateId; +} + +static void checkRdCosts(const depquant_state * const state, const enum ScanPosType spt, const PQData *pqDataA, const PQData *pqDataB, Decision *decisionA, Decision *decisionB) +{ + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar]; + int64_t rdCostA = state->m_rdCost + pqDataA->deltaDist; + int64_t rdCostB = state->m_rdCost + pqDataB->deltaDist; + int64_t rdCostZ = state->m_rdCost; + if (state->m_remRegBins >= 4) + { + if (pqDataA->absLevel < 4) + { + rdCostA += state->m_coeffFracBits[pqDataA->absLevel]; + } + else + { + const coeff_t value = (pqDataA->absLevel - 4) >> 1; + rdCostA += + state->m_coeffFracBits[pqDataA->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataB->absLevel < 4) + { + rdCostB += state->m_coeffFracBits[pqDataB->absLevel]; + } + else + { + const coeff_t value = (pqDataB->absLevel - 4) >> 1; + rdCostB += + state->m_coeffFracBits[pqDataB->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) + { + rdCostA += state->m_sigFracBits[1]; + rdCostB += state->m_sigFracBits[1]; + rdCostZ += state->m_sigFracBits[0]; + } + else if (spt == SCAN_SOCSBB) + { + rdCostA += state->m_sbbFracBits[1] + state->m_sigFracBits[1]; + rdCostB += state->m_sbbFracBits[1] + state->m_sigFracBits[1]; + rdCostZ += state->m_sbbFracBits[1] + state->m_sigFracBits[0]; + } + else if (state->m_numSigSbb) + { + rdCostA += state->m_sigFracBits[1]; + rdCostB += state->m_sigFracBits[1]; + rdCostZ += state->m_sigFracBits[0]; + } + else + { + rdCostZ = decisionA->rdCost; + } + } + else + { + rdCostA += + (1 << SCALE_BITS) + + goRiceTab[pqDataA->absLevel <= state->m_goRiceZero ? pqDataA->absLevel - 1 + : (pqDataA->absLevel < RICEMAX ? pqDataA->absLevel : RICEMAX - 1)]; + rdCostB += + (1 << SCALE_BITS) + + goRiceTab[pqDataB->absLevel <= state->m_goRiceZero ? pqDataB->absLevel - 1 + : (pqDataB->absLevel < RICEMAX ? pqDataB->absLevel : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero]; + } + if (rdCostA < decisionA->rdCost) + { + decisionA->rdCost = rdCostA; + decisionA->absLevel = pqDataA->absLevel; + decisionA->prevId = state->m_stateId; + } + if (rdCostZ < decisionA->rdCost) + { + decisionA->rdCost = rdCostZ; + decisionA->absLevel = 0; + decisionA->prevId = state->m_stateId; + } + if (rdCostB < decisionB->rdCost) + { + decisionB->rdCost = rdCostB; + decisionB->absLevel = pqDataB->absLevel; + decisionB->prevId = state->m_stateId; + } +} + +static INLINE void checkRdCostSkipSbb(const depquant_state* const state, Decision *decision) +{ + int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; + if (rdCost < decision->rdCost) + { + decision->rdCost = rdCost; + decision->absLevel = 0; + decision->prevId = 4 + state->m_stateId; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decision) +{ + int64_t rdCost = pqData->deltaDist + lastOffset; + if 
(pqData->absLevel < 4) + { + rdCost += state->m_coeffFracBits[pqData->absLevel]; + } + else + { + const coeff_t value = (pqData->absLevel - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel - (value << 1)] + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; + } + if (rdCost < decision->rdCost) + { + decision->rdCost = rdCost; + decision->absLevel = pqData->absLevel; + decision->prevId = -1; + } +} + + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, MIN(qp->m_maxQIdx, (coeff_t)((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + PQData *pq_a = &pqData[qIdx & 3]; + pq_a->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pq_a->absLevel = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + PQData *pq_b = &pqData[qIdx & 3]; + pq_b->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pq_b->absLevel = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + PQData *pq_c = &pqData[qIdx & 3]; + pq_c->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pq_c->absLevel = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + PQData *pq_d = &pqData[qIdx & 3]; + pq_d->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pq_d->absLevel = (++qIdx) >> 1; +} + + +#define DINIT(l,p) {INT64_MAX>>2,(l),(p)} +static const Decision startDec[8] = { DINIT(-1,-2),DINIT(-1,-2),DINIT(-1,-2),DINIT(-1,-2),DINIT(0,4),DINIT(0,5),DINIT(0,6),DINIT(0,7) }; +#undef DINIT + + +static void xDecide( + depquant_state* const m_skipStates, + depquant_state* const m_prevStates, + depquant_state* const m_startState, + quant_block *qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff) +{ + 
memcpy(decisions, startDec, 8 * sizeof(Decision)); + + if (zeroOut) + { + if (spt == SCAN_EOCSBB) + { + checkRdCostSkipSbbZeroOut(&decisions[0], &m_skipStates[0]); + checkRdCostSkipSbbZeroOut(&decisions[1], &m_skipStates[1]); + checkRdCostSkipSbbZeroOut(&decisions[2], &m_skipStates[2]); + checkRdCostSkipSbbZeroOut(&decisions[3], &m_skipStates[3]); + } + return; + } + + PQData pqData[4]; + preQuantCoeff(qp, absCoeff, pqData, quanCoeff); + checkRdCosts(&m_prevStates[0], spt, &pqData[0], &pqData[2], &decisions[0], &decisions[2]); + checkRdCosts(&m_prevStates[1], spt, &pqData[0], &pqData[2], &decisions[2], &decisions[0]); + checkRdCosts(&m_prevStates[2], spt, &pqData[3], &pqData[1], &decisions[1], &decisions[3]); + checkRdCosts(&m_prevStates[3], spt, &pqData[3], &pqData[1], &decisions[3], &decisions[1]); + if (spt == SCAN_EOCSBB) + { + checkRdCostSkipSbb(&m_skipStates[0], &decisions[0]); + checkRdCostSkipSbb(&m_skipStates[1], &decisions[1]); + checkRdCostSkipSbb(&m_skipStates[2], &decisions[2]); + checkRdCostSkipSbb(&m_skipStates[3], &decisions[3]); + } + + checkRdCostStart(m_startState, lastOffset, &pqData[0], &decisions[0]); + checkRdCostStart(m_startState, lastOffset, &pqData[2], &decisions[2]); +} + + +unsigned templateAbsCompare(coeff_t sum) +{ + int rangeIdx = 0; + if (sum < g_riceT[0]) + { + rangeIdx = 0; + } + else if (sum < g_riceT[1]) + { + rangeIdx = 1; + } + else if (sum < g_riceT[2]) + { + rangeIdx = 2; + } + else if (sum < g_riceT[3]) + { + rangeIdx = 3; + } + else + { + rangeIdx = 4; + } + return g_riceShift[rangeIdx]; +} + +static INLINE void update_common_context(common_context * cc, const ScanInfo *scanInfo, const depquant_state* prevState, depquant_state *currState) +{ + uint8_t* sbbFlags = cc->m_currSbbCtx[currState->m_stateId].sbbFlags; + uint8_t* levels = cc->m_currSbbCtx[currState->m_stateId].levels; + size_t setCpSize = cc->m_nbInfo[scanInfo.scanIdx - 1].maxDist * sizeof(uint8_t); + if (prevState && prevState->m_refSbbCtxId >= 0) + { + 
memcpy(sbbFlags, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].sbbFlags, scanInfo.numSbb * sizeof(uint8_t)); + memcpy(levels + scanInfo.scanIdx, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].levels + scanInfo.scanIdx, setCpSize); + } + else + { + memset(sbbFlags, 0, scanInfo.numSbb * sizeof(uint8_t)); + memset(levels + scanInfo.scanIdx, 0, setCpSize); + } + sbbFlags[scanInfo.sbbPos] = !!currState->m_numSigSbb; + memcpy(levels + scanInfo.scanIdx, currState->m_absLevelsAndCtxInit, scanInfo.sbbSize * sizeof(uint8_t)); + + const int sigNSbb = ((scanInfo.nextSbbRight ? sbbFlags[scanInfo.nextSbbRight] : false) || (scanInfo.nextSbbBelow ? sbbFlags[scanInfo.nextSbbBelow] : false) ? 1 : 0); + currState->m_numSigSbb = 0; + if (prevState) + { + currState->m_remRegBins = prevState->m_remRegBins; + } + else + { + int ctxBinSampleRatio = 28; // (scanInfo.chType == COLOR_Y) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + currState->m_remRegBins = (currState->effWidth * currState->effHeight * ctxBinSampleRatio) / 16; + } + currState->m_goRicePar = 0; + currState->m_refSbbCtxId = currState->m_stateId; + currState->m_sbbFracBits[0] = cc->m_sbbFlagBits[sigNSbb][0]; + currState->m_sbbFracBits[1] = cc->m_sbbFlagBits[sigNSbb][1]; + + uint16_t templateCtxInit[16]; + const int scanBeg = scanInfo.scanIdx - scanInfo.sbbSize; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = levels + scanBeg; + for (int id = 0; id < scanInfo.sbbSize; id++, nbOut++) + { + if (nbOut->num) + { + coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; +#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + UPDATE(0); + if (nbOut->num > 1) + { + UPDATE(1); + if (nbOut->num > 2) + { + UPDATE(2); + if (nbOut->num > 3) + { + UPDATE(3); + if (nbOut->num > 4) + { + UPDATE(4); + } + } + } + } +#undef UPDATE + templateCtxInit[id] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, 
sumAbs) << 8); + } + else + { + templateCtxInit[id] = 0; + } + } + memset(currState->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + memcpy(currState->m_absLevelsAndCtxInit + 8, templateCtxInit, 16 * sizeof(uint16_t)); +} + + +static INLINE void updateStateEOS(depquant_state * state, const ScanInfo *scanInfo, const depquant_state* prevStates, const depquant_state* skipStates, + const Decision *decision) +{ + state->m_rdCost = decision->rdCost; + if (decision->prevId > -2) + { + const depquant_state* prvState = 0; + if (decision->prevId >= 4) + { + prvState = skipStates + (decision->prevId - 4); + state->m_numSigSbb = 0; + memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + } + else if (decision->prevId >= 0) + { + prvState = prevStates + decision->prevId; + state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; + memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); + } + else + { + state->m_numSigSbb = 1; + memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + } + reinterpret_cast(m_absLevelsAndCtxInit)[scanInfo.insidePos] = (uint8_t)MIN(255, decision->absLevel); + + update_common_context(state->m_commonCtx, scanInfo, prvState, state); + + coeff_t tinit = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos]; + coeff_t sumNum = tinit & 7; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)]; + state->m_coeffFracBits = state->m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)]; + } +} + +static INLINE void updateState(depquant_state* state, int numIPos, const ScanInfo scanInfo, const depquant_state *prevStates, const Decision *decision, const int baseLevel, const bool extRiceFlag) +{ + state->m_rdCost = decision->rdCost; + if (decision->prevId > -2) + { + if (decision->prevId >= 0) + { + const depquant_state* prvState = prevStates + decision->prevId; + state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; + state->m_refSbbCtxId = prvState->m_refSbbCtxId; + state->m_sbbFracBits[0] = prvState->m_sbbFracBits[0]; + state->m_sbbFracBits[1] = prvState->m_sbbFracBits[1]; + state->m_remRegBins = prvState->m_remRegBins - 1; + state->m_goRicePar = prvState->m_goRicePar; + if (state->m_remRegBins >= 4) + { + state->m_remRegBins -= (decision->absLevel < 2 ? (unsigned)decision->absLevel : 3); + } + memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 48 * sizeof(uint8_t)); + } + else + { + state->m_numSigSbb = 1; + state->m_refSbbCtxId = -1; + int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decision->absLevel < 2 ? 
(unsigned)decision->absLevel : 3); + memset(state->m_absLevelsAndCtxInit, 0, 48 * sizeof(uint8_t)); + } + + uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit); + levels[scanInfo.insidePos] = (uint8_t)MIN(255, decision->absLevel); + + if (state->m_remRegBins >= 4) + { + coeff_t tinit = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + if (numIPos == 1) + { + UPDATE(0); + } + else if (numIPos == 2) + { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[0] = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[1] = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits, &state->m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits)); + + + coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; +#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs+=t; } + if (numIPos == 1) + { + UPDATE(0); + } + else if (numIPos == 2) + { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + if (extRiceFlag) + { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); + state->m_goRicePar = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar += currentShift; + } + else + { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar = g_goRiceParsCoeff[sumAll]; + } + } + else + { + coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; +#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs+=t; } + if (numIPos == 1) + { + UPDATE(0); + } + else if (numIPos == 2) + { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) + { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + if (extRiceFlag) + { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + sumAbs = MIN(31, sumAbs); + state->m_goRicePar = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar += currentShift; + } + else + { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero = (state->m_stateId < 2 ? 
1 : 2) << state->m_goRicePar; + } + } +} + +static void xDecideAndUpdate( + const coeff_t absCoeff, + const ScanInfo scanInfo, + bool zeroOut, + coeff_t quantCoeff, + int effWidth, + int effHeight, + bool reverseLast, + Decision* decisions) +{ + std::swap(m_prevStates, m_currStates); + + xDecide(scanInfo.spt, absCoeff, lastOffset(scanInfo.scanIdx, effWidth, effHeight, reverseLast), decisions, zeroOut, quantCoeff); + + if (scanInfo.scanIdx) { + if (scanInfo.eosbb) { + m_commonCtx.swap(); + updateStateEOS(&m_currStates[0], scanInfo, m_prevStates, m_skipStates, &decisions[0]); + updateStateEOS(&m_currStates[1], scanInfo, m_prevStates, m_skipStates, &decisions[1]); + updateStateEOS(&m_currStates[2], scanInfo, m_prevStates, m_skipStates, &decisions[2]); + updateStateEOS(&m_currStates[3], scanInfo, m_prevStates, m_skipStates, &decisions[3]); + memcpy(decisions + 4, decisions, 4 * sizeof(Decision)); + } else if (!zeroOut) { + + updateState(&m_currStates[0], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[0], m_baseLevel, m_extRiceRRCFlag); + updateState(&m_currStates[1], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[1], m_baseLevel, m_extRiceRRCFlag); + updateState(&m_currStates[2], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[2], m_baseLevel, m_extRiceRRCFlag); + updateState(&m_currStates[3], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[3], m_baseLevel, m_extRiceRRCFlag); + } + + if (scanInfo.spt == SCAN_SOCSBB) { + std::swap(m_prevStates, m_skipStates); + } + } +} + + uint8_t uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, const cu_loc_t* const cu_loc, const coeff_t* srcCoeff, - const coeff_t* coeff_out, + coeff_t* coeff_out, const color_t compID, enum uvg_tree_type tree_type, const double lambda, @@ -365,6 +940,7 @@ uint8_t uvg_dep_quant( quant_block quant_block; init_quant_block(state, &quant_block, cur_tu, log2_tr_width, log2_tr_height, compID, 
needs_block_size_trafo_scale, -1); + Decision trellis[TR_MAX_WIDTH * TR_MAX_WIDTH][8]; //===== scaling matrix ==== //const int qpDQ = cQP.Qp + 1; //const int qpPer = qpDQ / 6; @@ -389,14 +965,13 @@ uint8_t uvg_dep_quant( if ( lfnstIdx > 0 && !is_ts && width >= 4 && height >= 4) { - firstTestPos = - ((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; + firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; const coeff_t thres = 4 << q_bits; for (; firstTestPos >= 0; firstTestPos--) { - coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[firstTestPos])) :(thres / (4 * default_quant_coeff)); - if (abs(srcCoeff[firstTestPos]) > thresTmp) { + coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) :(thres / (4 * default_quant_coeff)); + if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { break; } } @@ -414,50 +989,48 @@ uint8_t uvg_dep_quant( depquant_state start_state; + int effectHeight = MIN(32, effHeight); + int effectWidth = MIN(32, effWidth); for (int k = 0; k < 12; k++) { depquant_state_init(&all_state[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); - all_state[k].effHeight = MIN(32, effHeight); - all_state[k].effWidth = MIN(32, effWidth); + all_state[k].effHeight = effectHeight; + all_state[k].effWidth = effectWidth; } depquant_state_init(&start_state, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); - start_state.effHeight = MIN(32, effHeight); - start_state.effWidth = MIN(32, effWidth); + start_state.effHeight = effectHeight; + start_state.effWidth = effectWidth; //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { - const ScanInfo& scanInfo = tuPars.m_scanInfo[scanIdx]; + uint32_t scan_pos = scan[scanIdx]; if (enableScalingLists) { - m_quant.initQuantBlock( - tu, - compID, - cQP, - 
lambda, - quantCoeff[scanInfo.rasterPos]); + init_quant_block(state, &quant_block, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[scan_pos]); + xDecideAndUpdate( - abs(tCoeff[scanInfo.rasterPos]), + abs(srcCoeff[scan_pos]), scanInfo, (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), - quantCoeff[scanInfo.rasterPos], + q_coeff[scan_pos], effectWidth, effectHeight, - tu.cu->slice->getReverseLastSigCoeffFlag()); + false); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { xDecideAndUpdate( - abs(tCoeff[scanInfo.rasterPos]), + abs(srcCoeff[scan_pos]), scanInfo, (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), default_quant_coeff, effectWidth, effectHeight, - tu.cu->slice->getReverseLastSigCoeffFlag()); - } + false); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } } //===== find best path ===== - Decision decision = {std::numeric_limits::max(), -1, -2}; + Decision decision = {INT64_MAX, -1, -2}; int64_t minPathCost = 0; for (int8_t stateId = 0; stateId < 4; stateId++) { - int64_t pathCost = m_trellis[0][stateId].rdCost; + int64_t pathCost = trellis[0][stateId].rdCost; if (pathCost < minPathCost) { decision.prevId = stateId; minPathCost = pathCost; @@ -467,10 +1040,9 @@ uint8_t uvg_dep_quant( //===== backward scanning ===== int scanIdx = 0; for (; decision.prevId >= 0; scanIdx++) { - decision = m_trellis[scanIdx][decision.prevId]; - int32_t blkpos = tuPars.m_scanId2BlkPos[scanIdx].idx; - q_coeff[blkpos] = - (tCoeff[blkpos] < 0 ? -decision.absLevel : decision.absLevel); + decision = trellis[scanIdx][decision.prevId]; + int32_t blkpos = scan[scanIdx]; + coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
-decision.absLevel : decision.absLevel); absSum += decision.absLevel; } } From 3e66a897d490da5c297197fd326c79a58d037be9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 10 Jan 2023 15:32:07 +0200 Subject: [PATCH 182/254] [DepQuant] WIP: compiles --- src/dep_quant.c | 668 +++++++++++++++++++++++++++++++++--------------- src/dep_quant.h | 16 ++ src/encoder.c | 7 + src/encoder.h | 4 + src/rdo.c | 2 +- src/uvg266.h | 2 +- 6 files changed, 484 insertions(+), 215 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 776d482b..ff9f62be 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -64,92 +64,258 @@ static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; -typedef struct { - int m_QShift; +typedef struct +{ + int m_QShift; int64_t m_QAdd; int64_t m_QScale; - coeff_t m_maxQIdx; + coeff_t m_maxQIdx; coeff_t m_thresLast; - coeff_t m_thresSSbb; + coeff_t m_thresSSbb; // distortion normalization - int m_DistShift; + int m_DistShift; int64_t m_DistAdd; int64_t m_DistStepAdd; int64_t m_DistOrgFact; } quant_block; -typedef struct { - uint8_t num; - uint8_t inPos[5]; -} NbInfoSbb; -typedef struct { - uint16_t maxDist; - uint16_t num; - uint16_t outPos[5]; -} NbInfoOut; - -typedef struct { +typedef struct +{ uint8_t* sbbFlags; uint8_t* levels; } SbbCtx; - -typedef struct +typedef struct { - coeff_t absLevel; + coeff_t absLevel; int64_t deltaDist; -}PQData; +} PQData; -typedef struct { +typedef struct +{ int64_t rdCost; - coeff_t absLevel; - int prevId; + coeff_t absLevel; + int prevId; } Decision; -typedef struct { +typedef struct +{ const NbInfoOut* m_nbInfo; - uint32_t m_sbbFlagBits[2][2]; - SbbCtx m_allSbbCtx[8]; - SbbCtx* m_currSbbCtx; - SbbCtx* m_prevSbbCtx; - uint8_t m_memory[8 * (TR_MAX_WIDTH * TR_MAX_WIDTH + 1024)]; + uint32_t m_sbbFlagBits[2][2]; + SbbCtx m_allSbbCtx[8]; + SbbCtx* m_currSbbCtx; + SbbCtx* m_prevSbbCtx; + uint8_t m_memory[8 * (TR_MAX_WIDTH 
* TR_MAX_WIDTH + 1024)]; } common_context; -typedef struct +typedef struct { - int32_t m_lastBitsX[TR_MAX_WIDTH]; - int32_t m_lastBitsY[TR_MAX_WIDTH]; - uint32_t m_sigSbbFracBits[sm_maxNumSigSbbCtx][2]; - uint32_t m_sigFracBits[sm_numCtxSetsSig][sm_maxNumSigCtx][2]; - int32_t m_gtxFracBits[sm_maxNumGtxCtx][6]; - + int32_t m_lastBitsX[TR_MAX_WIDTH]; + int32_t m_lastBitsY[TR_MAX_WIDTH]; + uint32_t m_sigSbbFracBits[sm_maxNumSigSbbCtx][2]; + uint32_t m_sigFracBits[sm_numCtxSetsSig][sm_maxNumSigCtx][2]; + int32_t m_gtxFracBits[sm_maxNumGtxCtx][6]; } rate_estimator; -typedef struct { - int64_t m_rdCost; - uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t m_numSigSbb; - int m_remRegBins; - int8_t m_refSbbCtxId; - uint32_t m_sbbFracBits[2]; - uint32_t m_sigFracBits[2]; - int32_t m_coeffFracBits[6]; - int8_t m_goRicePar; - int8_t m_goRiceZero; - int8_t m_stateId; - const uint32_t* m_sigFracBitsArray; - const uint32_t* m_gtxFracBitsArray; - struct common_context* m_commonCtx; - +typedef struct +{ + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + const uint32_t* m_sigFracBitsArray[2]; + const uint32_t* m_gtxFracBitsArray[6]; + struct common_context* m_commonCtx; + unsigned effWidth; unsigned effHeight; } depquant_state; +typedef struct +{ + common_context m_common_context; + depquant_state m_allStates[12]; + depquant_state* m_currStates; + depquant_state* m_prevStates; + depquant_state* m_skipStates; + depquant_state m_startState; + quant_block m_quant; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH][8]; +} context_store; + + +int uvg_init_nb_info(encoder_control_t * encoder) { + memset(encoder->m_scanId2NbInfoSbbArray, 0, 
sizeof(encoder->m_scanId2NbInfoSbbArray)); + memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); + for (int hd = 0; hd <= 7; hd++) + { + + uint32_t raster2id[64 * 64] = {0}; + + for (int vd = 0; vd <= 7; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + const uint32_t blockWidth = (1 << hd); + const uint32_t blockHeight = (1 << vd); + const uint32_t log2CGWidth = g_log2_sbb_size[hd][vd][0]; + const uint32_t log2CGHeight = g_log2_sbb_size[hd][vd][1]; + const uint32_t groupWidth = 1 << log2CGWidth; + const uint32_t groupHeight = 1 << log2CGHeight; + const uint32_t groupSize = groupWidth * groupHeight; + const int scanType = SCAN_DIAG; + const uint32_t blkWidthIdx = hd; + const uint32_t blkHeightIdx = vd; + const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx); + NbInfoSbb** sId2NbSbb = &encoder->m_scanId2NbInfoSbbArray[hd][vd]; + NbInfoOut** sId2NbOut = &encoder->m_scanId2NbInfoOutArray[hd][vd]; + // consider only non-zero-out region + const uint32_t blkWidthNZOut = MIN(32, blockWidth); + const uint32_t blkHeightNZOut = MIN(32, blockHeight); + const uint32_t totalValues = blkWidthNZOut * blkHeightNZOut; + + *sId2NbSbb = MALLOC(NbInfoSbb, totalValues); + if (*sId2NbSbb == NULL) { + return 0; + } + *sId2NbOut = MALLOC(NbInfoOut, totalValues); + if (*sId2NbOut == NULL) { + return 0; + } + + for (uint32_t scanId = 0; scanId < totalValues; scanId++) + { + raster2id[scanId2RP[scanId]] = scanId; + } + + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + const int rpos = scanId2RP[scanId]; + uint32_t pos_y = rpos >> hd; + uint32_t pos_x = rpos - (pos_y << hd); // TODO: height + { + //===== inside subband neighbours ===== + NbInfoSbb *nbSbb = &(*sId2NbSbb)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? 
(raster2id[rpos + 1] < groupSize + begSbb ? raster2id[rpos + 1] - begSbb : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] < groupSize + begSbb ? raster2id[rpos + 2] - begSbb : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] < groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] - begSbb : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] < groupSize + begSbb ? raster2id[rpos + blockWidth] - begSbb : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] < groupSize + begSbb ? raster2id[rpos + 2 * blockWidth] - begSbb : 0) : 0); + + for (nbSbb->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbSbb->inPos[nbSbb->num++] = (uint8_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbSbb->num; k < 5; k++) + { + nbSbb->inPos[k] = 0; + } + } + { + //===== outside subband neighbours ===== + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? (raster2id[rpos + 1] >= groupSize + begSbb ? raster2id[rpos + 1] : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] >= groupSize + begSbb ? raster2id[rpos + 2] : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] >= groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] >= groupSize + begSbb ? raster2id[rpos + blockWidth] : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] >= groupSize + begSbb ? 
raster2id[rpos + 2 * blockWidth] : 0) : 0); + + for (nbOut->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbOut->outPos[nbOut->num++] = (uint16_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbOut->num; k < 5; k++) + { + nbOut->outPos[k] = 0; + } + nbOut->maxDist = (scanId == 0 ? 0 : (*sId2NbOut)[scanId - 1].maxDist); + for (int k = 0; k < nbOut->num; k++) + { + if (nbOut->outPos[k] > nbOut->maxDist) + { + nbOut->maxDist = nbOut->outPos[k]; + } + } + } + } + + // make it relative + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + for (int k = 0; k < nbOut->num; k++) + { + nbOut->outPos[k] -= begSbb; + } + nbOut->maxDist -= scanId; + } + } + } + return 1; +} + +void uvg_dealloc_nb_info(encoder_control_t* encoder) { + + for (int hd = 0; hd <= 7; hd++) { + for (int vd = 0; vd <= 7; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoOutArray[hd][vd]); + if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoSbbArray[hd][vd]); + } + } +} + static void init_quant_block( const encoder_state_t* state, @@ -207,7 +373,7 @@ static void init_quant_block( static void reset_common_context(common_context* ctx, const rate_estimator * rate_estimator, int numSbb, int num_coeff) { - memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); + //memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); const int chunkSize = numSbb + num_coeff; uint8_t* nextMem = ctx->m_memory; @@ -215,6 +381,8 @@ static void reset_common_context(common_context* ctx, const rate_estimator * rat 
ctx->m_allSbbCtx[k].sbbFlags = nextMem; ctx->m_allSbbCtx[k].levels = nextMem + numSbb; } + ctx->m_currSbbCtx = &ctx->m_allSbbCtx[0]; + ctx->m_prevSbbCtx = &ctx->m_allSbbCtx[4]; } static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data_t * const ctx, color_t color) @@ -569,119 +737,136 @@ unsigned templateAbsCompare(coeff_t sum) return g_riceShift[rangeIdx]; } -static INLINE void update_common_context(common_context * cc, const ScanInfo *scanInfo, const depquant_state* prevState, depquant_state *currState) +static INLINE void update_common_context( + common_context * cc, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const int sigNSbb, + const depquant_state* prevState, + depquant_state *currState) { - uint8_t* sbbFlags = cc->m_currSbbCtx[currState->m_stateId].sbbFlags; - uint8_t* levels = cc->m_currSbbCtx[currState->m_stateId].levels; - size_t setCpSize = cc->m_nbInfo[scanInfo.scanIdx - 1].maxDist * sizeof(uint8_t); - if (prevState && prevState->m_refSbbCtxId >= 0) - { - memcpy(sbbFlags, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].sbbFlags, scanInfo.numSbb * sizeof(uint8_t)); - memcpy(levels + scanInfo.scanIdx, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].levels + scanInfo.scanIdx, setCpSize); - } - else - { - memset(sbbFlags, 0, scanInfo.numSbb * sizeof(uint8_t)); - memset(levels + scanInfo.scanIdx, 0, setCpSize); - } - sbbFlags[scanInfo.sbbPos] = !!currState->m_numSigSbb; - memcpy(levels + scanInfo.scanIdx, currState->m_absLevelsAndCtxInit, scanInfo.sbbSize * sizeof(uint8_t)); + const uint32_t numSbb = width_in_sbb * height_in_sbb; + uint8_t* sbbFlags = cc->m_currSbbCtx[currState->m_stateId].sbbFlags; + uint8_t* levels = cc->m_currSbbCtx[currState->m_stateId].levels; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + if (prevState && prevState->m_refSbbCtxId >= 0) { + memcpy(sbbFlags, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].sbbFlags, numSbb * sizeof(uint8_t)); + 
memcpy(levels + scan_pos, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].levels + scan_pos, setCpSize); + } + else { + memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); + memset(levels + scan_pos, 0, setCpSize); + } + sbbFlags[scan_pos >> 4] = !!currState->m_numSigSbb; + memcpy(levels + scan_pos, currState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); - const int sigNSbb = ((scanInfo.nextSbbRight ? sbbFlags[scanInfo.nextSbbRight] : false) || (scanInfo.nextSbbBelow ? sbbFlags[scanInfo.nextSbbBelow] : false) ? 1 : 0); - currState->m_numSigSbb = 0; - if (prevState) - { - currState->m_remRegBins = prevState->m_remRegBins; - } - else - { - int ctxBinSampleRatio = 28; // (scanInfo.chType == COLOR_Y) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - currState->m_remRegBins = (currState->effWidth * currState->effHeight * ctxBinSampleRatio) / 16; - } - currState->m_goRicePar = 0; - currState->m_refSbbCtxId = currState->m_stateId; - currState->m_sbbFracBits[0] = cc->m_sbbFlagBits[sigNSbb][0]; - currState->m_sbbFracBits[1] = cc->m_sbbFlagBits[sigNSbb][1]; + currState->m_numSigSbb = 0; + if (prevState) { + currState->m_remRegBins = prevState->m_remRegBins; + } + else { + int ctxBinSampleRatio = 28; + // (scanInfo.chType == COLOR_Y) ? 
MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + currState->m_remRegBins = (currState->effWidth * currState->effHeight * ctxBinSampleRatio) / 16; + } + currState->m_goRicePar = 0; + currState->m_refSbbCtxId = currState->m_stateId; + currState->m_sbbFracBits[0] = cc->m_sbbFlagBits[sigNSbb][0]; + currState->m_sbbFracBits[1] = cc->m_sbbFlagBits[sigNSbb][1]; - uint16_t templateCtxInit[16]; - const int scanBeg = scanInfo.scanIdx - scanInfo.sbbSize; - const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = levels + scanBeg; - for (int id = 0; id < scanInfo.sbbSize; id++, nbOut++) - { - if (nbOut->num) - { - coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; + uint16_t templateCtxInit[16]; + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = levels + scanBeg; + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num) { + coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; #define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } - UPDATE(0); - if (nbOut->num > 1) - { - UPDATE(1); - if (nbOut->num > 2) - { - UPDATE(2); - if (nbOut->num > 3) - { - UPDATE(3); - if (nbOut->num > 4) - { - UPDATE(4); - } - } - } + UPDATE(0); + if (nbOut->num > 1) { + UPDATE(1); + if (nbOut->num > 2) { + UPDATE(2); + if (nbOut->num > 3) { + UPDATE(3); + if (nbOut->num > 4) { + UPDATE(4); } + } + } + } #undef UPDATE - templateCtxInit[id] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); - } - else - { - templateCtxInit[id] = 0; - } + templateCtxInit[id] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); } - memset(currState->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); - memcpy(currState->m_absLevelsAndCtxInit + 8, templateCtxInit, 16 * sizeof(uint16_t)); + else { + templateCtxInit[id] = 0; + } + } + memset(currState->m_absLevelsAndCtxInit, 0, 16 * 
sizeof(uint8_t)); + memcpy(currState->m_absLevelsAndCtxInit + 8, templateCtxInit, 16 * sizeof(uint16_t)); } -static INLINE void updateStateEOS(depquant_state * state, const ScanInfo *scanInfo, const depquant_state* prevStates, const depquant_state* skipStates, - const Decision *decision) +static INLINE void updateStateEOS( + depquant_state * state, + const uint32_t scan_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t sigNSbb, + const depquant_state* prevStates, + const depquant_state* skipStates, + const Decision *decision) { state->m_rdCost = decision->rdCost; if (decision->prevId > -2) { - const depquant_state* prvState = 0; - if (decision->prevId >= 4) - { - prvState = skipStates + (decision->prevId - 4); - state->m_numSigSbb = 0; - memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); - } - else if (decision->prevId >= 0) - { - prvState = prevStates + decision->prevId; - state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; - memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); - } - else - { - state->m_numSigSbb = 1; - memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); - } - reinterpret_cast(m_absLevelsAndCtxInit)[scanInfo.insidePos] = (uint8_t)MIN(255, decision->absLevel); + const depquant_state* prvState = 0; + if (decision->prevId >= 4) + { + prvState = skipStates + (decision->prevId - 4); + state->m_numSigSbb = 0; + memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + } + else if (decision->prevId >= 0) + { + prvState = prevStates + decision->prevId; + state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; + memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); + } + else + { + state->m_numSigSbb = 1; + memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + } + uint8_t* temp = 
(uint8_t*)(state->m_absLevelsAndCtxInit[scan_pos & 15]); + *temp = (uint8_t)MIN(255, decision->absLevel); - update_common_context(state->m_commonCtx, scanInfo, prvState, state); - - coeff_t tinit = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos]; - coeff_t sumNum = tinit & 7; - coeff_t sumAbs1 = (tinit >> 3) & 31; - coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)]; - state->m_coeffFracBits = state->m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)]; + update_common_context(state->m_commonCtx, scan_pos, width_in_sbb, height_in_sbb, sigNSbb, prvState, state); + + coeff_t tinit = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)]; + coeff_t sumNum = tinit & 7; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[0] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[1] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + + memcpy(state->m_coeffFracBits, state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits)); } } -static INLINE void updateState(depquant_state* state, int numIPos, const ScanInfo scanInfo, const depquant_state *prevStates, const Decision *decision, const int baseLevel, const bool extRiceFlag) -{ +static INLINE void updateState( + depquant_state* state, + int numIPos, const uint32_t scan_pos, + const depquant_state* prevStates, + const Decision* decision, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag) { state->m_rdCost = decision->rdCost; if (decision->prevId > -2) { @@ -710,14 +895,14 @@ static INLINE void updateState(depquant_state* state, int numIPos, const ScanInf } uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit); - levels[scanInfo.insidePos] = (uint8_t)MIN(255, decision->absLevel); + levels[scan_pos & 15] = (uint8_t)MIN(255, decision->absLevel); if (state->m_remRegBins >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos]; + coeff_t tinit = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; -#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } if (numIPos == 1) { UPDATE(0); @@ -750,13 +935,13 @@ static INLINE void updateState(depquant_state* state, int numIPos, const ScanInf } #undef UPDATE coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[0] = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[1] = state->m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits, &state->m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits)); + state->m_sigFracBits[0] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[1] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits, state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits)); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; -#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs+=t; } + coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } if (numIPos == 1) { UPDATE(0); @@ -804,8 +989,8 @@ static INLINE void updateState(depquant_state* state, int numIPos, const ScanInf } else { - coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; -#define UPDATE(k) {coeff_t t=levels[scanInfo.nextNbInfoSbb.inPos[k]]; sumAbs+=t; } + coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } if (numIPos == 1) { UPDATE(0); @@ -856,37 +1041,56 @@ static INLINE void updateState(depquant_state* state, int numIPos, const ScanInf } static void xDecideAndUpdate( - const coeff_t absCoeff, - const ScanInfo scanInfo, - bool zeroOut, - coeff_t quantCoeff, - int effWidth, - int effHeight, - bool reverseLast, - Decision* decisions) + rate_estimator* re, + context_store* ctxs, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t pos_x, + const uint32_t pos_y, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t sigNSbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + int effWidth, + int effHeight) { - std::swap(m_prevStates, m_currStates); + Decision* decisions = 
ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_currStates, ctxs->m_prevStates, depquant_state*); - xDecide(scanInfo.spt, absCoeff, lastOffset(scanInfo.scanIdx, effWidth, effHeight, reverseLast), decisions, zeroOut, quantCoeff); + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } - if (scanInfo.scanIdx) { - if (scanInfo.eosbb) { - m_commonCtx.swap(); - updateStateEOS(&m_currStates[0], scanInfo, m_prevStates, m_skipStates, &decisions[0]); - updateStateEOS(&m_currStates[1], scanInfo, m_prevStates, m_skipStates, &decisions[1]); - updateStateEOS(&m_currStates[2], scanInfo, m_prevStates, m_skipStates, &decisions[2]); - updateStateEOS(&m_currStates[3], scanInfo, m_prevStates, m_skipStates, &decisions[3]); + xDecide(ctxs->m_skipStates, ctxs->m_prevStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[pos_x] + re->m_lastBitsY[pos_y], decisions, zeroOut, quantCoeff); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); + updateStateEOS(&ctxs->m_currStates[0], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[0]); + updateStateEOS(&ctxs->m_currStates[1], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[1]); + updateStateEOS(&ctxs->m_currStates[2], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[2]); + updateStateEOS(&ctxs->m_currStates[3], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[3]); memcpy(decisions + 4, decisions, 4 * 
sizeof(Decision)); } else if (!zeroOut) { - updateState(&m_currStates[0], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[0], m_baseLevel, m_extRiceRRCFlag); - updateState(&m_currStates[1], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[1], m_baseLevel, m_extRiceRRCFlag); - updateState(&m_currStates[2], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[2], m_baseLevel, m_extRiceRRCFlag); - updateState(&m_currStates[3], scanInfo.nextNbInfoSbb.num, scanInfo, m_prevStates, decisions[3], m_baseLevel, m_extRiceRRCFlag); + updateState(&ctxs->m_currStates[0], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[0], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + updateState(&ctxs->m_currStates[1], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[1], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + updateState(&ctxs->m_currStates[2], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[2], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + updateState(&ctxs->m_currStates[3], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[3], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); } - if (scanInfo.spt == SCAN_SOCSBB) { - std::swap(m_prevStates, m_skipStates); + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skipStates, ctxs->m_prevStates, depquant_state*); } } } @@ -907,6 +1111,10 @@ uint8_t uvg_dep_quant( const encoder_control_t* const encoder = state->encoder_control; //===== reset / pre-init ===== const int baseLevel = 4; + context_store dep_quant_context; + dep_quant_context.m_currStates = &dep_quant_context.m_allStates[0]; + dep_quant_context.m_prevStates = &dep_quant_context.m_allStates[4]; + dep_quant_context.m_skipStates = &dep_quant_context.m_allStates[8]; const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const uint32_t height = compID == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; @@ -925,6 +1133,7 @@ uint8_t uvg_dep_quant( const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; @@ -936,11 +1145,9 @@ uint8_t uvg_dep_quant( const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (is_ts ? 0 : transform_shift ); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); - - quant_block quant_block; - init_quant_block(state, &quant_block, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); - - Decision trellis[TR_MAX_WIDTH * TR_MAX_WIDTH][8]; + + init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + //===== scaling matrix ==== //const int qpDQ = cQP.Qp + 1; //const int qpPer = qpDQ / 6; @@ -970,7 +1177,7 @@ uint8_t uvg_dep_quant( const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; const coeff_t thres = 4 << q_bits; for (; firstTestPos >= 0; firstTestPos--) { - coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) :(thres / (4 * default_quant_coeff)); + coeff_t thresTmp = (enableScalingLists) ? 
(thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { break; } @@ -983,46 +1190,81 @@ uint8_t uvg_dep_quant( rate_estimator rate_estimator; init_rate_esimator(&rate_estimator, &state->search_cabac, compID); xSetLastCoeffOffset(state, cur_tu, cu_loc, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); - common_context common_context; - reset_common_context(&common_context, &rate_estimator, (width * height) >> 4, numCoeff); - depquant_state all_state[12]; - depquant_state start_state; + reset_common_context(&dep_quant_context.m_common_context, &rate_estimator, (width * height) >> 4, numCoeff); + dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; + int effectHeight = MIN(32, effHeight); int effectWidth = MIN(32, effWidth); for (int k = 0; k < 12; k++) { - depquant_state_init(&all_state[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); - all_state[k].effHeight = effectHeight; - all_state[k].effWidth = effectWidth; + depquant_state_init(&dep_quant_context.m_allStates[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); + dep_quant_context.m_allStates[k].effHeight = effectHeight; + dep_quant_context.m_allStates[k].effWidth = effectWidth; } - depquant_state_init(&start_state, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); - start_state.effHeight = effectHeight; - start_state.effWidth = effectWidth; - + depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); + dep_quant_context.m_startState.effHeight = effectHeight; + dep_quant_context.m_startState.effWidth = effectWidth; + + + const uint32_t height_in_sbb = MAX(height >> 2, 1); + const uint32_t width_in_sbb = MAX(width >> 2, 1); //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { - uint32_t scan_pos = 
scan[scanIdx]; + uint32_t blkpos = scan[scanIdx]; + uint32_t pos_y = blkpos >> log2_tr_width; + uint32_t pos_x = blkpos - (pos_y << log2_tr_width); + + uint32_t cg_blockpos = scanIdx ? cg_scan[(scanIdx -1) >> 4] : 0; + uint32_t cg_pos_y = cg_blockpos / height_in_sbb; + uint32_t cg_pos_x = cg_blockpos - (cg_pos_y * height_in_sbb); + uint32_t diag = cg_pos_y + cg_pos_x; + + uint32_t sig_ctx_offset = compID == COLOR_Y ? (diag < 2 ? 8 : diag < 5 ? 4 : 0) : (diag < 2 ? 4 : 0); + uint32_t gtx_ctx_offset = compID == COLOR_Y ? (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1) : (diag < 1 ? 6 : 1); + + uint32_t nextSbbRight = (cg_pos_x < width_in_sbb - 1 ? cg_blockpos + 1 : 0); + uint32_t nextSbbBelow = (cg_pos_y < height_in_sbb - 1 ? cg_blockpos + width_in_sbb : 0); + if (enableScalingLists) { - init_quant_block(state, &quant_block, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[scan_pos]); + init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); xDecideAndUpdate( - abs(srcCoeff[scan_pos]), - scanInfo, - (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), - q_coeff[scan_pos], + &rate_estimator, + &dep_quant_context, + abs(srcCoeff[blkpos]), + scanIdx, + pos_x, + pos_y, + sig_ctx_offset, + gtx_ctx_offset, + width_in_sbb, + height_in_sbb, + nextSbbRight || nextSbbBelow, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? 
scanIdx - 1 : 0], + (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), + q_coeff[blkpos], effectWidth, - effectHeight, - false); //tu.cu->slice->getReverseLastSigCoeffFlag()); + effectHeight + ); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { xDecideAndUpdate( - abs(srcCoeff[scan_pos]), - scanInfo, - (zeroOut && (scanInfo.posX >= effWidth || scanInfo.posY >= effHeight)), + &rate_estimator, + &dep_quant_context, + abs(srcCoeff[blkpos]), + scanIdx, + pos_x, + pos_y, + sig_ctx_offset, + gtx_ctx_offset, + width_in_sbb, + height_in_sbb, + nextSbbRight || nextSbbBelow, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], + (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), default_quant_coeff, effectWidth, - effectHeight, - false); //tu.cu->slice->getReverseLastSigCoeffFlag()); + effectHeight); //tu.cu->slice->getReverseLastSigCoeffFlag()); } } @@ -1030,7 +1272,7 @@ uint8_t uvg_dep_quant( Decision decision = {INT64_MAX, -1, -2}; int64_t minPathCost = 0; for (int8_t stateId = 0; stateId < 4; stateId++) { - int64_t pathCost = trellis[0][stateId].rdCost; + int64_t pathCost = dep_quant_context.m_trellis[0][stateId].rdCost; if (pathCost < minPathCost) { decision.prevId = stateId; minPathCost = pathCost; @@ -1040,7 +1282,7 @@ uint8_t uvg_dep_quant( //===== backward scanning ===== int scanIdx = 0; for (; decision.prevId >= 0; scanIdx++) { - decision = trellis[scanIdx][decision.prevId]; + decision = dep_quant_context.m_trellis[scanIdx][decision.prevId]; int32_t blkpos = scan[scanIdx]; coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
-decision.absLevel : decision.absLevel); absSum += decision.absLevel; diff --git a/src/dep_quant.h b/src/dep_quant.h index 35fec0b5..0e1d20ca 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -35,6 +35,22 @@ #include "global.h" +typedef struct encoder_control_t encoder_control_t; +typedef struct +{ + uint8_t num; + uint8_t inPos[5]; +} NbInfoSbb; + +typedef struct +{ + uint16_t maxDist; + uint16_t num; + uint16_t outPos[5]; +} NbInfoOut; + +int uvg_init_nb_info(encoder_control_t* encoder); +void uvg_dealloc_nb_info(encoder_control_t* encoder); #endif diff --git a/src/encoder.c b/src/encoder.c index f3d7653a..56d03305 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -320,6 +320,13 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) encoder->scaling_list.use_default_list = 1; } + if(cfg->dep_quant) { + if(!uvg_init_nb_info(encoder)) { + fprintf(stderr, "Could not initialize nb info.\n"); + goto init_failed; + } + } + // ROI / delta QP if (cfg->roi.file_path) { const char *mode[2] = { "r", "rb" }; diff --git a/src/encoder.h b/src/encoder.h index be835890..81b091b3 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -38,6 +38,7 @@ * Initialization of encoder_control_t. */ +#include "dep_quant.h" #include "global.h" // IWYU pragma: keep #include "uvg266.h" #include "scalinglist.h" @@ -98,6 +99,9 @@ typedef struct encoder_control_t //scaling list scaling_list_t scaling_list; + NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1]; + NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1]; + //spec: references to variables defined in Rec. 
ITU-T H.265 (04/2013) int8_t tiles_enable; /*!> log2_block_width; - uint32_t pos_x = blkpos - (pos_y << log2_block_width); // TODO: height + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== quantization ===== // set coeff diff --git a/src/uvg266.h b/src/uvg266.h index fe6e2b0f..c71a835a 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -552,7 +552,7 @@ typedef struct uvg_config uint8_t intra_rough_search_levels; uint8_t ibc; /* \brief Intra Block Copy parameter */ - + uint8_t dep_quant; } uvg_config; /** From f8994a7fae30055e11664e8f740b9a8cab867764 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 11 Jan 2023 09:25:34 +0200 Subject: [PATCH 183/254] [DepQuant] WIP: dequant --- src/dep_quant.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++--- src/dep_quant.h | 21 +++++++++++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index ff9f62be..54d656e0 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1096,7 +1096,7 @@ static void xDecideAndUpdate( } -uint8_t uvg_dep_quant( +int uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, const cu_loc_t* const cu_loc, @@ -1104,8 +1104,7 @@ uint8_t uvg_dep_quant( coeff_t* coeff_out, const color_t compID, enum uvg_tree_type tree_type, - const double lambda, - coeff_t* absSum, + int* absSum, const bool enableScalingLists) { const encoder_control_t* const encoder = state->encoder_control; @@ -1285,6 +1284,83 @@ uint8_t uvg_dep_quant( decision = dep_quant_context.m_trellis[scanIdx][decision.prevId]; int32_t blkpos = scan[scanIdx]; coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
-decision.absLevel : decision.absLevel); - absSum += decision.absLevel; + *absSum += decision.absLevel; + } + return *absSum; +} + + +void uvg_dep_quant_dequant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const cu_loc_t* const cu_loc, + const color_t compID, + coeff_t* quant_coeff, + coeff_t * coeff, + bool enableScalingLists) +{ + const encoder_control_t* const encoder = state->encoder_control; + const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + + const int numCoeff = width * height; + + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, 0, log2_tr_width, log2_tr_height); + bool needs_block_size_trafo_scale =((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + //----- reset coefficients and get last scan index ----- + memset(coeff, 0, numCoeff * sizeof(coeff_t)); + int lastScanIdx = -1; + for (int scanIdx = numCoeff - 1; scanIdx >= 0; scanIdx--) + { + if (quant_coeff[scan[scanIdx]]) + { + lastScanIdx = scanIdx; + break; + } + } + if (lastScanIdx < 0) + { + return; + } + + //----- set dequant parameters ----- + const int qpDQ = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + 1; + const int qpPer = qpDQ / 6; + const int qpRem = qpDQ - 6 * qpPer; + const int channelBitDepth = encoder->bitdepth; + const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE; + const coeff_t minTCoeff = -(1 << maxLog2TrDynamicRange); + const coeff_t maxTCoeff = (1 << maxLog2TrDynamicRange) - 1; + const int transformShift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; + int shift = IQUANT_SHIFT + 1 - qpPer - transformShift + 
(enableScalingLists ? 4 : 0); + int invQScale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale ? 1 : 0][qpRem]; + int add = (shift < 0) ? 0 : ((1 << shift) >> 1); + int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)(compID); + + const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qpDQ % 6]; + //----- dequant coefficients ----- + for (int state = 0, scanIdx = lastScanIdx; scanIdx >= 0; scanIdx--) + { + const unsigned rasterPos = scan[scanIdx]; + const coeff_t level = quant_coeff[rasterPos]; + if (level) + { + if (enableScalingLists) + { + invQScale = dequant_coef[rasterPos];//scalingfactor*levelScale + } + if (shift < 0 && (enableScalingLists || scanIdx == lastScanIdx)) + { + invQScale <<= -shift; + } + int qIdx = (level << 1) + (level > 0 ? -(state >> 1) : (state >> 1)); + int64_t nomTCoeff = ((int64_t)qIdx * (int64_t)invQScale + add) >> ((shift < 0) ? 0 : shift); + coeff[rasterPos] = (coeff_t)CLIP(minTCoeff, maxTCoeff, nomTCoeff); + } + state = (32040 >> ((state << 2) + ((level & 1) << 1))) & 3; // the 16-bit value "32040" represent the state transition table } } diff --git a/src/dep_quant.h b/src/dep_quant.h index 0e1d20ca..c237f81e 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -33,6 +33,7 @@ #ifndef DEP_QUANT_H_ #define DEP_QUANT_H_ +#include "cu.h" #include "global.h" typedef struct encoder_control_t encoder_control_t; @@ -53,4 +54,24 @@ typedef struct int uvg_init_nb_info(encoder_control_t* encoder); void uvg_dealloc_nb_info(encoder_control_t* encoder); + +void uvg_dep_quant_dequant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const cu_loc_t* const cu_loc, + const color_t compID, + coeff_t* quant_coeff, + coeff_t* coeff, + bool enableScalingLists); + +int uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const cu_loc_t* const cu_loc, + const coeff_t* srcCoeff, + coeff_t* coeff_out, + const color_t 
compID, + enum uvg_tree_type tree_type, + int* absSum, + const bool enableScalingLists); #endif From bfa699fac6ce5956ca615f08e09834b3e8e7a11a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 11 Jan 2023 10:12:59 +0200 Subject: [PATCH 184/254] [DepQuant] WPP: API --- src/cfg.c | 4 ++++ src/cli.c | 3 +++ src/encoder_state-bitstream.c | 9 ++++++--- src/strategies/generic/encode_coding_tree-generic.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index a7555ddc..bf9e1307 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -242,6 +242,7 @@ int uvg_config_init(uvg_config *cfg) cfg->ibc = 0; + cfg->dep_quant = 0; return 1; } @@ -1551,6 +1552,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) } cfg->ibc = (uint8_t)ibc_value; } + else if OPT("dep-quant") { + cfg->dep_quant = (bool)atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index ab91e844..ce238506 100644 --- a/src/cli.c +++ b/src/cli.c @@ -199,6 +199,8 @@ static const struct option long_options[] = { {"max_tt_size", required_argument, NULL, 0 }, { "intra-rough-granularity",required_argument, NULL, 0 }, { "ibc", required_argument, NULL, 0 }, + { "dep-quant", no_argument, NULL, 0 }, + { "no-dep-quant", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -577,6 +579,7 @@ void print_help(void) " - full: Full ALF\n" " --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n" " --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n" + " --(no-)dep-quant : Use dependent quantization. 
[disabled]\n" " --(no-)signhide : Sign hiding [disabled]\n" " --rd : Intra mode search complexity [0]\n" " - 0: Skip intra if inter is good enough.\n" diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 920331a5..f4716c67 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -689,7 +689,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "scaling_list_enabled_flag"); - WRITE_U(stream, 0, 1, "pic_dep_quant_enabled_flag"); + WRITE_U(stream, encoder->cfg.dep_quant, 1, "pic_dep_quant_enabled_flag"); WRITE_U(stream, encoder->cfg.signhide_enable, 1, "pic_sign_data_hiding_enabled_flag"); @@ -1358,11 +1358,14 @@ void uvg_encoder_state_write_bitstream_slice_header( } // ToDo: depquant + if (encoder->cfg.dep_quant) { + WRITE_U(stream, 1, 1, "sh_dep_quant_used_flag"); + } - if (state->encoder_control->cfg.signhide_enable) { + if (state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { WRITE_U(stream, 1, 1, "sh_sign_data_hiding_used_flag"); } - if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable /* && !cfg.dep_quant*/) + if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { // TODO: find out what this is actually about and parametrize it WRITE_U(stream, 0, 1, "sh_ts_residual_coding_disabled_flag"); diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 48a5cc3d..21854920 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -302,7 +302,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, uint32_t num_signs = num_non_zero; - if (state->encoder_control->cfg.signhide_enable && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { + if (state->encoder_control->cfg.signhide_enable && 
!state->encoder_control->cfg.dep_quant && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { num_signs--; coeff_signs >>= 1; } From 5236bc93be90c56213c8d706dbbe00dde63cfce4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 11 Jan 2023 14:17:18 +0200 Subject: [PATCH 185/254] [DepQuant] WIP: doesn't crash but bitstream is illegal and quality a lot worse --- src/dep_quant.c | 103 ++++++++++++++++--------- src/dep_quant.h | 8 +- src/rate_control.c | 34 ++++++-- src/strategies/avx2/quant-avx2.c | 21 ++++- src/strategies/generic/quant-generic.c | 37 ++++++++- 5 files changed, 155 insertions(+), 48 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 54d656e0..70f0e28d 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -69,9 +69,9 @@ typedef struct int m_QShift; int64_t m_QAdd; int64_t m_QScale; - coeff_t m_maxQIdx; - coeff_t m_thresLast; - coeff_t m_thresSSbb; + int64_t m_maxQIdx; + int64_t m_thresLast; + int64_t m_thresSSbb; // distortion normalization int m_DistShift; int64_t m_DistAdd; @@ -135,9 +135,9 @@ typedef struct int8_t m_goRicePar; int8_t m_goRiceZero; int8_t m_stateId; - const uint32_t* m_sigFracBitsArray[2]; - const uint32_t* m_gtxFracBitsArray[6]; - struct common_context* m_commonCtx; + uint32_t *m_sigFracBitsArray[12]; + int32_t *m_gtxFracBitsArray[21]; + common_context* m_commonCtx; unsigned effWidth; unsigned effHeight; @@ -159,12 +159,12 @@ typedef struct int uvg_init_nb_info(encoder_control_t * encoder) { memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); - for (int hd = 0; hd <= 7; hd++) + for (int hd = 0; hd <= 6; hd++) { uint32_t raster2id[64 * 64] = {0}; - for (int vd = 0; vd <= 7; vd++) + for (int vd = 0; vd <= 6; vd++) { if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) { @@ -317,6 +317,21 @@ void uvg_dealloc_nb_info(encoder_control_t* encoder) { } +static INLINE int ceil_log2(uint64_t x) +{ + static const 
uint64_t t[6] = { 0xFFFFFFFF00000000ull, 0x00000000FFFF0000ull, 0x000000000000FF00ull, 0x00000000000000F0ull, 0x000000000000000Cull, 0x0000000000000002ull }; + int y = (((x & (x - 1)) == 0) ? 0 : 1); + int j = 32; + for (int i = 0; i < 6; i++) + { + int k = (((x & t[i]) == 0) ? 0 : j); + y += k; + x >>= k; + j >>= 1; + } + return y; +} + static void init_quant_block( const encoder_state_t* state, quant_block* qp, @@ -349,8 +364,8 @@ static void init_quant_block( maxLog2TrDynamicRange + 1, 8 * sizeof(int) + invShift - IQUANT_SHIFT - 1); qp->m_maxQIdx = (1 << (qIdxBD - 1)) - 4; - qp->m_thresLast = (coeff_t)(((int64_t)(4) << qp->m_QShift)); - qp->m_thresSSbb = (coeff_t)(((int64_t)(3) << qp->m_QShift)); + qp->m_thresLast = (((int64_t)(4) << (int64_t)qp->m_QShift)); + qp->m_thresSSbb = (((int64_t)(3) << (int64_t)qp->m_QShift)); // distortion calculation parameters const int64_t qScale = (gValue == -1) ? qp->m_QScale : gValue; const int nomDShift = @@ -363,8 +378,7 @@ static void init_quant_block( 1.0 / ((double)((int64_t)(1) << (-nomDShift)) * qScale2 * lambda) : (double)((int64_t)(1) << nomDShift) / (qScale2 * lambda)); const int64_t pow2dfShift = (int64_t)(nomDistFactor * qScale2) + 1; - assert(pow2dfShift > 0xfffffffll); - const int dfShift = uvg_math_ceil_log2(pow2dfShift); + const int dfShift = ceil_log2(pow2dfShift); qp->m_DistShift = 62 + qp->m_QShift - 2 * maxLog2TrDynamicRange - dfShift; qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); @@ -404,8 +418,8 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data numCtx = (color == COLOR_Y? 21 : 11); for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { const cabac_ctx_t * par_ctx = color == COLOR_Y ? &ctx->ctx.cu_parity_flag_model_luma[ctxId] : &ctx->ctx.cu_parity_flag_model_chroma[ctxId]; - const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? 
&ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; - const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; + const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; + const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; int32_t* cb = &rate_estimator->m_gtxFracBits[ctxId]; int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); @@ -420,12 +434,13 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data } - static void xSetLastCoeffOffset( +static void xSetLastCoeffOffset( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, - rate_estimator* rate_estimator, - const bool cb_cbf, + const cu_info_t* const cur_tu, + const int width, + const int height, + rate_estimator* rate_estimator, + const bool cb_cbf, const color_t compID) { int32_t cbfDeltaBits = 0; @@ -438,7 +453,7 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data if (useIntraSubPartitions) { bool rootCbfSoFar = false; bool isLastSubPartition = false; //TODO: isp check - uint32_t nTus = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, cur_tu->intra.isp_mode, true); + uint32_t nTus = uvg_get_isp_split_num(width, height, cur_tu->intra.isp_mode, true); if (isLastSubPartition) { //TransformUnit* tuPointer = tu.cu->firstTU; //for (int tuIdx = 0; tuIdx < nTus - 1; tuIdx++) { @@ -477,7 +492,7 @@ static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; for (unsigned xy = 0; xy < 2; xy++) { int32_t bitOffset = (xy ? cbfDeltaBits : 0); int32_t* lastBits = (xy ? rate_estimator->m_lastBitsY : rate_estimator->m_lastBitsX); - const unsigned size = (xy ? (compID == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height) : (compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width)); + const unsigned size = (xy ? (height) : (width)); const unsigned log2Size = uvg_math_ceil_log2(size); const bool useYCtx = (xy != 0); const cabac_ctx_t* const ctxSetLast = useYCtx ? @@ -504,15 +519,18 @@ static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2], uint32_t gtx_frac_bits[6]) { - state->m_rdCost = INT64_MAX; + state->m_rdCost = INT64_MAX >> 1; state->m_numSigSbb = 0; state->m_remRegBins = 4; // just large enough for last scan pos state->m_refSbbCtxId = -1; state->m_sigFracBits[0] = sig_frac_bits[0]; state->m_sigFracBits[1] = sig_frac_bits[1]; - memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(gtx_frac_bits)); + memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(state->m_coeffFracBits)); state->m_goRicePar = 0; state->m_goRiceZero = 0; + + state->m_sbbFracBits[0] = 0; + state->m_sbbFracBits[1] = 0; } static INLINE void checkRdCostSkipSbbZeroOut(Decision *decision, const depquant_state * const state) @@ -841,7 +859,7 @@ static INLINE void updateStateEOS( state->m_numSigSbb = 1; memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); } - uint8_t* temp = (uint8_t*)(state->m_absLevelsAndCtxInit[scan_pos & 15]); + uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[scan_pos & 15]); *temp = (uint8_t)MIN(255, decision->absLevel); update_common_context(state->m_commonCtx, scan_pos, width_in_sbb, height_in_sbb, sigNSbb, prvState, state); @@ -1099,7 +1117,8 @@ static void xDecideAndUpdate( int uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int width, + const int height, const coeff_t* srcCoeff, coeff_t* coeff_out, const color_t compID, @@ -1115,8 +1134,6 @@ int uvg_dep_quant( dep_quant_context.m_prevStates = &dep_quant_context.m_allStates[4]; dep_quant_context.m_skipStates = 
&dep_quant_context.m_allStates[8]; - const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; @@ -1173,8 +1190,8 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } - const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; - const coeff_t thres = 4 << q_bits; + const int32_t default_quant_coeff = dep_quant_context.m_quant.m_QScale; + const int32_t thres = dep_quant_context.m_quant.m_thresLast; for (; firstTestPos >= 0; firstTestPos--) { coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { @@ -1188,7 +1205,7 @@ int uvg_dep_quant( //===== real init ===== rate_estimator rate_estimator; init_rate_esimator(&rate_estimator, &state->search_cabac, compID); - xSetLastCoeffOffset(state, cur_tu, cu_loc, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); + xSetLastCoeffOffset(state, cur_tu, width, height, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); reset_common_context(&dep_quant_context.m_common_context, &rate_estimator, (width * height) >> 4, numCoeff); dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; @@ -1200,10 +1217,27 @@ int uvg_dep_quant( depquant_state_init(&dep_quant_context.m_allStates[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); dep_quant_context.m_allStates[k].effHeight = effectHeight; dep_quant_context.m_allStates[k].effWidth = effectWidth; + dep_quant_context.m_allStates[k].m_commonCtx = &dep_quant_context.m_common_context; + int i1 = (k & 3) ? 
(k & 3) - 1 : 0; + dep_quant_context.m_allStates[k].m_stateId = i1; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_allStates[k].m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[i1][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + dep_quant_context.m_allStates[k].m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + } } depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); dep_quant_context.m_startState.effHeight = effectHeight; dep_quant_context.m_startState.effWidth = effectWidth; + dep_quant_context.m_startState.m_stateId = 0; + dep_quant_context.m_startState.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[0][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + } const uint32_t height_in_sbb = MAX(height >> 2, 1); @@ -1292,16 +1326,15 @@ int uvg_dep_quant( void uvg_dep_quant_dequant( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int block_type, + const int width, + const int height, const color_t compID, coeff_t* quant_coeff, coeff_t * coeff, bool enableScalingLists) { const encoder_control_t* const encoder = state->encoder_control; - const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int numCoeff = width * height; @@ -1339,7 +1372,7 @@ void uvg_dep_quant_dequant( int shift = IQUANT_SHIFT + 1 - qpPer - transformShift + (enableScalingLists ? 4 : 0); int invQScale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale ? 1 : 0][qpRem]; int add = (shift < 0) ? 
0 : ((1 << shift) >> 1); - int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)(compID); + int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(compID); const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qpDQ % 6]; //----- dequant coefficients ----- diff --git a/src/dep_quant.h b/src/dep_quant.h index c237f81e..c3fb69a4 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -57,8 +57,9 @@ void uvg_dealloc_nb_info(encoder_control_t* encoder); void uvg_dep_quant_dequant( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int block_type, + const int width, + const int height, const color_t compID, coeff_t* quant_coeff, coeff_t* coeff, @@ -67,7 +68,8 @@ void uvg_dep_quant_dequant( int uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int width, + const int height, const coeff_t* srcCoeff, coeff_t* coeff_out, const color_t compID, diff --git a/src/rate_control.c b/src/rate_control.c index 67570565..0660f0ac 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -795,6 +795,9 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) state->frame->QP + 2 + frame_allocation, est_qp); } + if(state->encoder_control->cfg.dep_quant) { + est_lambda *= pow(2, 0.25 / 3.0); + } state->lambda = est_lambda; state->lambda_sqrt = sqrt(est_lambda); @@ -820,7 +823,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 
0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); ctu->adjust_lambda = state->lambda; @@ -1103,7 +1110,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, pos.x = 0; } state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { @@ -1138,6 +1150,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, state->frame->lambda * 1.5874010519681994, lambda); lambda = clip_lambda(lambda); + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } state->lambda = lambda; state->lambda_sqrt = sqrt(lambda); @@ -1145,8 +1160,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, } else { state->qp = state->frame->QP; - state->lambda = state->frame->lambda; - state->lambda_sqrt = sqrt(state->frame->lambda); + double lambda = state->frame->lambda; + + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = lambda; + state->lambda_sqrt = sqrt(lambda); } lcu->lambda = state->lambda; @@ -1170,7 +1190,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); lcu->adjust_lambda = state->lambda; diff --git a/src/strategies/avx2/quant-avx2.c 
b/src/strategies/avx2/quant-avx2.c index d49b2f8f..00ef1248 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -707,8 +707,21 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, } // Quantize coeffs. (coeff -> coeff_out) - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if(!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, @@ -792,6 +805,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if (encoder->cfg.dep_quant) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index ed30b691..6a7d8990 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -316,8 +316,21 @@ int uvg_quant_cbcr_residual_generic( if(lfnst_idx) { uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + 
state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, @@ -497,7 +510,21 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Quantize coeffs. (coeff -> coeff_out) - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, @@ -591,6 +618,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if(encoder->cfg.dep_quant) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; From 5abe9e57c6dd23e847d03328291ed2a6434727bf Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 12 Jan 2023 15:05:37 +0200 Subject: [PATCH 186/254] [DepQuant] Working but not necessarily improving --- src/dep_quant.c | 69 ++++++++++++------- .../generic/encode_coding_tree-generic.c | 2 +- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 70f0e28d..3c78b71f 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -729,7 +729,7 @@ static void xDecide( } -unsigned templateAbsCompare(coeff_t sum) +static 
INLINE unsigned templateAbsCompare(coeff_t sum) { int rangeIdx = 0; if (sum < g_riceT[0]) @@ -758,9 +758,11 @@ unsigned templateAbsCompare(coeff_t sum) static INLINE void update_common_context( common_context * cc, const uint32_t scan_pos, + const uint32_t cg_pos, const uint32_t width_in_sbb, const uint32_t height_in_sbb, - const int sigNSbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, const depquant_state* prevState, depquant_state *currState) { @@ -776,9 +778,10 @@ static INLINE void update_common_context( memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); memset(levels + scan_pos, 0, setCpSize); } - sbbFlags[scan_pos >> 4] = !!currState->m_numSigSbb; + sbbFlags[cg_pos] = !!currState->m_numSigSbb; memcpy(levels + scan_pos, currState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); + const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0); currState->m_numSigSbb = 0; if (prevState) { currState->m_remRegBins = prevState->m_remRegBins; @@ -829,11 +832,13 @@ static INLINE void update_common_context( static INLINE void updateStateEOS( depquant_state * state, const uint32_t scan_pos, + const uint32_t cg_pos, const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, const uint32_t width_in_sbb, const uint32_t height_in_sbb, - const uint32_t sigNSbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, const depquant_state* prevStates, const depquant_state* skipStates, const Decision *decision) @@ -862,7 +867,7 @@ static INLINE void updateStateEOS( uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[scan_pos & 15]); *temp = (uint8_t)MIN(255, decision->absLevel); - update_common_context(state->m_commonCtx, scan_pos, width_in_sbb, height_in_sbb, sigNSbb, prvState, state); + update_common_context(state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below,prvState, state); coeff_t tinit = 
state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)]; coeff_t sumNum = tinit & 7; @@ -1063,13 +1068,15 @@ static void xDecideAndUpdate( context_store* ctxs, const coeff_t absCoeff, const uint32_t scan_pos, + const uint32_t cg_pos, const uint32_t pos_x, const uint32_t pos_y, const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, const uint32_t width_in_sbb, const uint32_t height_in_sbb, - const uint32_t sigNSbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, const NbInfoSbb next_nb_info_ssb, bool zeroOut, coeff_t quantCoeff, @@ -1094,10 +1101,10 @@ static void xDecideAndUpdate( if (scan_pos) { if (!(scan_pos & 15)) { SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); - updateStateEOS(&ctxs->m_currStates[0], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[0]); - updateStateEOS(&ctxs->m_currStates[1], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[1]); - updateStateEOS(&ctxs->m_currStates[2], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[2]); - updateStateEOS(&ctxs->m_currStates[3], scan_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, sigNSbb, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[3]); + updateStateEOS(&ctxs->m_currStates[0], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[0]); + updateStateEOS(&ctxs->m_currStates[1], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[1]); + updateStateEOS(&ctxs->m_currStates[2], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, 
width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[2]); + updateStateEOS(&ctxs->m_currStates[3], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[3]); memcpy(decisions + 4, decisions, 4 * sizeof(Decision)); } else if (!zeroOut) { @@ -1218,10 +1225,9 @@ int uvg_dep_quant( dep_quant_context.m_allStates[k].effHeight = effectHeight; dep_quant_context.m_allStates[k].effWidth = effectWidth; dep_quant_context.m_allStates[k].m_commonCtx = &dep_quant_context.m_common_context; - int i1 = (k & 3) ? (k & 3) - 1 : 0; - dep_quant_context.m_allStates[k].m_stateId = i1; + dep_quant_context.m_allStates[k].m_stateId = k & 3; for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { - dep_quant_context.m_allStates[k].m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[i1][i]; + dep_quant_context.m_allStates[k].m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i]; } for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { dep_quant_context.m_allStates[k].m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; @@ -1247,17 +1253,21 @@ int uvg_dep_quant( uint32_t blkpos = scan[scanIdx]; uint32_t pos_y = blkpos >> log2_tr_width; uint32_t pos_x = blkpos - (pos_y << log2_tr_width); + uint32_t cg_pos = cg_scan[scanIdx >> 4]; - uint32_t cg_blockpos = scanIdx ? cg_scan[(scanIdx -1) >> 4] : 0; - uint32_t cg_pos_y = cg_blockpos / height_in_sbb; - uint32_t cg_pos_x = cg_blockpos - (cg_pos_y * height_in_sbb); - uint32_t diag = cg_pos_y + cg_pos_x; + uint32_t blkpos_next = scan[scanIdx ? scanIdx - 1 : 0]; + uint32_t pos_y_next = blkpos_next >> log2_tr_width; + uint32_t pos_x_next = blkpos_next - (pos_y << log2_tr_width); + uint32_t cg_blockpos_next = scanIdx ? 
cg_scan[(scanIdx -1) >> 4] : 0; + uint32_t cg_pos_y_next = cg_blockpos_next / height_in_sbb; + uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * height_in_sbb); + uint32_t diag = pos_y_next + pos_x_next; uint32_t sig_ctx_offset = compID == COLOR_Y ? (diag < 2 ? 8 : diag < 5 ? 4 : 0) : (diag < 2 ? 4 : 0); uint32_t gtx_ctx_offset = compID == COLOR_Y ? (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1) : (diag < 1 ? 6 : 1); - uint32_t nextSbbRight = (cg_pos_x < width_in_sbb - 1 ? cg_blockpos + 1 : 0); - uint32_t nextSbbBelow = (cg_pos_y < height_in_sbb - 1 ? cg_blockpos + width_in_sbb : 0); + uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); + uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? cg_blockpos_next + width_in_sbb : 0); if (enableScalingLists) { init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); @@ -1267,38 +1277,47 @@ int uvg_dep_quant( &dep_quant_context, abs(srcCoeff[blkpos]), scanIdx, + cg_pos, pos_x, pos_y, sig_ctx_offset, gtx_ctx_offset, width_in_sbb, height_in_sbb, - nextSbbRight || nextSbbBelow, + nextSbbRight, + nextSbbBelow, encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), q_coeff[blkpos], effectWidth, effectHeight - ); //tu.cu->slice->getReverseLastSigCoeffFlag()); - } else { + ); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } + else { xDecideAndUpdate( &rate_estimator, &dep_quant_context, abs(srcCoeff[blkpos]), scanIdx, + cg_pos, pos_x, pos_y, sig_ctx_offset, gtx_ctx_offset, width_in_sbb, height_in_sbb, - nextSbbRight || nextSbbBelow, + nextSbbRight, + nextSbbBelow, encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? 
scanIdx - 1 : 0], (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), default_quant_coeff, effectWidth, effectHeight); //tu.cu->slice->getReverseLastSigCoeffFlag()); - } + } + Decision* d = dep_quant_context.m_trellis[scanIdx]; + Decision temp[8]; + memcpy(temp, d, sizeof(Decision) * 8); + d++; } //===== find best path ===== @@ -1361,7 +1380,7 @@ void uvg_dep_quant_dequant( } //----- set dequant parameters ----- - const int qpDQ = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + 1; + const int qpDQ = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]) + 1; const int qpPer = qpDQ / 6; const int qpRem = qpDQ - 6 * qpPer; const int channelBitDepth = encoder->bitdepth; diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 21854920..c3065903 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -133,7 +133,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, - uint32_t quant_state_transition_table = 0; //ToDo: dep quant enable changes this + uint32_t quant_state_transition_table = state->encoder_control->cfg.dep_quant ? 32040 : 0; int32_t quant_state = 0; uint8_t ctx_offset[16]; int32_t temp_diag = -1; From c6087230a8368cf9e3124f9c26629afb33a6e3d9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 17 Jan 2023 11:03:14 +0200 Subject: [PATCH 187/254] [DepQuant] Fix --- src/dep_quant.c | 11 ++++------- src/strategies/generic/quant-generic.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 3c78b71f..d599ae08 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1257,7 +1257,7 @@ int uvg_dep_quant( uint32_t blkpos_next = scan[scanIdx ? 
scanIdx - 1 : 0]; uint32_t pos_y_next = blkpos_next >> log2_tr_width; - uint32_t pos_x_next = blkpos_next - (pos_y << log2_tr_width); + uint32_t pos_x_next = blkpos_next - (pos_y_next << log2_tr_width); uint32_t cg_blockpos_next = scanIdx ? cg_scan[(scanIdx -1) >> 4] : 0; uint32_t cg_pos_y_next = cg_blockpos_next / height_in_sbb; uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * height_in_sbb); @@ -1269,12 +1269,13 @@ int uvg_dep_quant( uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? cg_blockpos_next + width_in_sbb : 0); + context_store* ctxs = &dep_quant_context; if (enableScalingLists) { init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); xDecideAndUpdate( &rate_estimator, - &dep_quant_context, + ctxs, abs(srcCoeff[blkpos]), scanIdx, cg_pos, @@ -1296,7 +1297,7 @@ int uvg_dep_quant( else { xDecideAndUpdate( &rate_estimator, - &dep_quant_context, + ctxs, abs(srcCoeff[blkpos]), scanIdx, cg_pos, @@ -1314,10 +1315,6 @@ int uvg_dep_quant( effectWidth, effectHeight); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - Decision* d = dep_quant_context.m_trellis[scanIdx]; - Decision temp[8]; - memcpy(temp, d, sizeof(Decision) * 8); - d++; } //===== find best path ===== diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 6a7d8990..b8db045f 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -519,7 +519,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, height, coeff, coeff_out, - COLOR_U, + color, tree_type, &abs_sum, state->encoder_control->cfg.scaling_list); From 505c26eef3abc9f1d09ee2d4a1ac5fb2514e9cde Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 17 Jan 2023 11:03:14 +0200 Subject: [PATCH 188/254] [DepQuant] Fix --- src/strategies/avx2/quant-avx2.c 
| 2 +- src/strategies/generic/quant-generic.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 00ef1248..7729d272 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -805,7 +805,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; - if (encoder->cfg.dep_quant) { + if (encoder->cfg.dep_quant && !transform_skip) { uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); return; } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index b8db045f..ceb6b7aa 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -511,7 +511,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Quantize coeffs. 
(coeff -> coeff_out) int abs_sum = 0; - if (!false && state->encoder_control->cfg.dep_quant) { + if (!use_trskip && state->encoder_control->cfg.dep_quant) { uvg_dep_quant( state, cur_cu, @@ -618,7 +618,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; - if(encoder->cfg.dep_quant) { + if(encoder->cfg.dep_quant && !transform_skip) { uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); return; } From dc652c75f9951caee003ab96f77311dabbf4a476 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 19 Jan 2023 16:30:47 +0200 Subject: [PATCH 189/254] [DepQuant] Isp and chroma --- src/cu.h | 2 ++ src/dep_quant.c | 24 ++++++++---------------- src/intra.c | 3 ++- src/transform.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/cu.h b/src/cu.h index e3668f19..843fe582 100644 --- a/src/cu.h +++ b/src/cu.h @@ -169,6 +169,8 @@ typedef struct int8_t mip_flag; int8_t mip_is_transposed; int8_t isp_mode; + uint8_t isp_cbfs : 4; + uint8_t isp_index : 2; } intra; struct { mv_t mv[2][2]; // \brief Motion vectors for L0 and L1 diff --git a/src/dep_quant.c b/src/dep_quant.c index d599ae08..b5fe6c55 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -440,7 +440,6 @@ static void xSetLastCoeffOffset( const int width, const int height, rate_estimator* rate_estimator, - const bool cb_cbf, const color_t compID) { int32_t cbfDeltaBits = 0; @@ -451,25 +450,18 @@ static void xSetLastCoeffOffset( bool lastCbfIsInferred = false; bool useIntraSubPartitions = cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode && compID == COLOR_Y; if (useIntraSubPartitions) { - bool rootCbfSoFar = false; - bool isLastSubPartition 
= false; //TODO: isp check uint32_t nTus = uvg_get_isp_split_num(width, height, cur_tu->intra.isp_mode, true); + bool isLastSubPartition = cur_tu->intra.isp_index +1 == nTus; //TODO: isp check if (isLastSubPartition) { - //TransformUnit* tuPointer = tu.cu->firstTU; - //for (int tuIdx = 0; tuIdx < nTus - 1; tuIdx++) { - // rootCbfSoFar |= TU::getCbfAtDepth(*tuPointer, COMPONENT_Y, tu.depth); - // tuPointer = tuPointer->next; - //} - if (!rootCbfSoFar) { - lastCbfIsInferred = true; - } + lastCbfIsInferred = cur_tu->intra.isp_cbfs == 0; } if (!lastCbfIsInferred) { - prevLumaCbf = false; + prevLumaCbf = cur_tu->intra.isp_index != 0 && (cur_tu->intra.isp_cbfs & (1 << (cur_tu->intra.isp_index - 1))); } const cabac_ctx_t * const cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[2 + prevLumaCbf]; cbfDeltaBits = lastCbfIsInferred ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); - } else { + } + else { const cabac_ctx_t* cbf_ctx; switch (compID) { case COLOR_Y: @@ -479,7 +471,7 @@ static void xSetLastCoeffOffset( cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cb[0]; break; case COLOR_V: - cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cb_cbf]; + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cbf_is_set(cur_tu->cbf, COLOR_U)]; break; } cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); @@ -1182,7 +1174,7 @@ int uvg_dep_quant( int effWidth = width, effHeight = height; if ( (is_mts || - (state->encoder_control->cfg.mts && 1 /*sbt not used by block*/ && + (state->encoder_control->cfg.mts && 0 /*sbt used by block*/ && height <= 32 && width <= 32)) && compID == COLOR_Y) { effHeight = (height == 32) ? 
16 : height; @@ -1212,7 +1204,7 @@ int uvg_dep_quant( //===== real init ===== rate_estimator rate_estimator; init_rate_esimator(&rate_estimator, &state->search_cabac, compID); - xSetLastCoeffOffset(state, cur_tu, width, height, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); + xSetLastCoeffOffset(state, cur_tu, width, height, &rate_estimator, compID); reset_common_context(&dep_quant_context.m_common_context, &rate_estimator, (width * height) >> 4, numCoeff); dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; diff --git a/src/intra.c b/src/intra.c index 0217ed7c..d3241b34 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1941,7 +1941,7 @@ void uvg_intra_recon_cu( uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); cu_loc_t pu_loc; uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); - + cur_cu->intra.isp_index = 0; if(tu_loc.x % 4 == 0) { intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data, tree_type); } @@ -1949,6 +1949,7 @@ void uvg_intra_recon_cu( &tu_loc, cur_cu, lcu, false, tree_type); search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, COLOR_Y) << i; + cur_cu->intra.isp_cbfs = search_data->best_isp_cbfs; } } const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; diff --git a/src/transform.c b/src/transform.c index 34e246ce..2ee2fc32 100644 --- a/src/transform.c +++ b/src/transform.c @@ -417,6 +417,7 @@ static void generate_jccr_transforms( static void quantize_chroma( encoder_state_t* const state, + cu_info_t * const cur_tu, int8_t width, int8_t height, coeff_t u_coeff[5120], @@ -427,8 +428,48 @@ static void quantize_chroma( const coeff_scan_order_t scan_order, bool* u_has_coeffs, bool* v_has_coeffs, - uint8_t lfnst_idx) + uint8_t lfnst_idx, + enum uvg_tree_type tree_type) { + if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { + int abs_sum = 
0; + uvg_dep_quant( + state, + cur_tu, + width, + height, + u_coeff, + u_quant_coeff, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + if (abs_sum > 0) { + *u_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_U); + } + if (transform == DCT7_CHROMA) { + abs_sum = 0; + uvg_dep_quant( + state, + cur_tu, + width, + height, + v_coeff, + v_quant_coeff, + COLOR_V, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + if (abs_sum > 0) { + *v_has_coeffs = 1; + } + cbf_clear(&cur_tu->cbf, COLOR_U); + } + return; + } if (state->encoder_control->cfg.rdoq_enable && (transform != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { @@ -561,6 +602,7 @@ void uvg_chroma_transform_search( } quantize_chroma( state, + pred_cu, width, height, &u_coeff[i * trans_offset], @@ -570,8 +612,7 @@ void uvg_chroma_transform_search( v_quant_coeff, SCAN_DIAG, &u_has_coeffs, - &v_has_coeffs, - tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx); + &v_has_coeffs, tree_type == UVG_CHROMA_T ? 
pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx, tree_type); if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { From 93c7e9c2969de3e322edce4f97cb5fd30e3b0f9a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 23 Jan 2023 13:39:22 +0200 Subject: [PATCH 190/254] [DepQuant] Fix for mts and lfnst being quantized incorrectly during search --- src/dep_quant.c | 4 ++-- src/search_intra.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index b5fe6c55..cd461815 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1145,7 +1145,7 @@ int uvg_dep_quant( const bool is_mts = compID == COLOR_Y && cur_tu->tr_idx > MTS_SKIP; const bool is_ts = cur_tu->tr_skip >> compID & 1; - const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height); @@ -1155,7 +1155,7 @@ int uvg_dep_quant( bool needs_block_size_trafo_scale = is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); needs_block_size_trafo_scale |= 0; // Non log2 block size - const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; + const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; const int32_t *q_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (is_ts ? 
0 : transform_shift ); diff --git a/src/search_intra.c b/src/search_intra.c index f05aa208..8067772a 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -390,7 +390,12 @@ static double search_intra_trdepth( if (pred_cu->lfnst_idx > 0 && pred_cu->tr_idx > 0) { continue; } - + + if (!has_been_split) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); + state->search_cabac.update = 1; + } + uvg_intra_recon_cu( state, search_data, @@ -435,12 +440,7 @@ static double search_intra_trdepth( continue; } } - - if (!has_been_split) { - memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); - state->search_cabac.update = 1; - } - + double rd_cost = uvg_cu_rd_cost_luma( state, cu_loc, From 6e24b9a7f982e2054cd9059e72db08c23ff87406 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 31 Jan 2023 14:26:00 +0200 Subject: [PATCH 191/254] [DepQuant] Fix isp+depquant and trskip + isp --- src/dep_quant.c | 6 ++-- src/intra.c | 61 ++++++++++++++++++++++++++++++++-- src/intra.h | 6 ++++ src/search.c | 38 ++++++++++++++++++---- src/search_intra.c | 81 ++++++++++++++++++++++++++-------------------- 5 files changed, 145 insertions(+), 47 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index cd461815..80a5c179 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -450,7 +450,7 @@ static void xSetLastCoeffOffset( bool lastCbfIsInferred = false; bool useIntraSubPartitions = cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode && compID == COLOR_Y; if (useIntraSubPartitions) { - uint32_t nTus = uvg_get_isp_split_num(width, height, cur_tu->intra.isp_mode, true); + uint32_t nTus = uvg_get_isp_split_num(1 << cur_tu->log2_width, 1 << cur_tu->log2_height, cur_tu->intra.isp_mode, true); bool isLastSubPartition = cur_tu->intra.isp_index +1 == nTus; //TODO: isp check if (isLastSubPartition) { lastCbfIsInferred = cur_tu->intra.isp_cbfs == 0; @@ -479,7 +479,7 @@ static void xSetLastCoeffOffset( } -static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; 
+ static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; uint32_t ctxBits[14]; for (unsigned xy = 0; xy < 2; xy++) { int32_t bitOffset = (xy ? cbfDeltaBits : 0); @@ -1143,7 +1143,7 @@ int uvg_dep_quant( *absSum = 0; const bool is_mts = compID == COLOR_Y && cur_tu->tr_idx > MTS_SKIP; - const bool is_ts = cur_tu->tr_skip >> compID & 1; + const bool is_ts = (cur_tu->tr_skip >> compID) & 1; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; diff --git a/src/intra.c b/src/intra.c index d3241b34..026254e1 100644 --- a/src/intra.c +++ b/src/intra.c @@ -37,8 +37,10 @@ #include "image.h" #include "uvg_math.h" #include "mip_data.h" +#include "rdo.h" #include "search.h" #include "search_intra.h" +#include "strategies-picture.h" #include "strategies/strategies-intra.h" #include "tables.h" #include "transform.h" @@ -1693,6 +1695,8 @@ int8_t uvg_get_co_located_luma_mode( } + + /** * \brief Returns ISP split partition size based on block dimensions and split type. * @@ -1788,8 +1792,6 @@ static void intra_recon_tb_leaf( const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; const int height = color == COLOR_Y ? 
pu_loc->height : pu_loc->chroma_height; - int log2_width = uvg_g_convert_to_log2[width]; - int log2_height = uvg_g_convert_to_log2[height]; const int lcu_width = LCU_WIDTH >> shift; @@ -2026,3 +2028,58 @@ bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp } return true; } + + +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu) { + assert(state->search_cabac.update && "ISP reconstruction must be done with CABAC update"); + double cost = 0; + + const int width = cu_loc->width; + const int height = cu_loc->height; + + search_data->best_isp_cbfs = 0; + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = search_data->pred_cu.intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + int cbf_context = 2; + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); + search_data->pred_cu.intra.isp_index = 0; + if (tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data, UVG_LUMA_T); + } + uvg_quantize_lcu_residual(state, true, false, false, + &tu_loc, &search_data->pred_cu, lcu, + false, UVG_LUMA_T); + + int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; + int ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + tu_loc.width, tu_loc.height); + double coeff_bits = uvg_get_coeff_cost(state, lcu->coeff.y, NULL, &tu_loc, 0, SCAN_DIAG, false, COEFF_ORDER_CU); + + + int cbf = cbf_is_set(search_data->pred_cu.cbf, COLOR_Y); + if (i + 1 != split_limit && 
search_data->best_isp_cbfs != 0) { + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_luma[cbf_context], cbf, coeff_bits, "cbf_luma_isp_recon"); + } + cost += ssd + coeff_bits * state->lambda; + + cbf_context = 2 + cbf; + + search_data->best_isp_cbfs |= cbf << i; + search_data->pred_cu.intra.isp_cbfs = search_data->best_isp_cbfs; + + } + return cost; +} \ No newline at end of file diff --git a/src/intra.h b/src/intra.h index 515abc85..71de9a6a 100644 --- a/src/intra.h +++ b/src/intra.h @@ -152,6 +152,12 @@ void uvg_intra_recon_cu( bool recon_luma, bool recon_chroma); +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu); + int8_t uvg_get_co_located_luma_mode( const cu_loc_t* const chroma_loc, const cu_loc_t* const cu_loc, diff --git a/src/search.c b/src/search.c index b8bb7a63..40cc012a 100644 --- a/src/search.c +++ b/src/search.c @@ -1364,12 +1364,27 @@ static double search_cu( } #endif if (state->encoder_control->cfg.cclm && tree_type != UVG_CHROMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_intra_recon_cu(state, - &intra_search, cu_loc, - &intra_search.pred_cu, lcu, - tree_type, - true, - false); + if(intra_search.pred_cu.intra.isp_mode == ISP_MODE_NO_ISP) { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + &intra_search.pred_cu, lcu, + tree_type, + true, + false); + } + else { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu + ); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } downsample_cclm_rec( state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] @@ -1461,7 +1476,7 @@ static double search_cu( if (cur_cu->type == CU_INTRA) { bool recon_chroma = true; - bool recon_luma = 
tree_type != UVG_CHROMA_T; + bool recon_luma = tree_type != UVG_CHROMA_T && cur_cu->intra.isp_mode == ISP_MODE_NO_ISP; if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T || cu_loc->chroma_height % 4 == 2) { recon_chroma = false; } @@ -1471,6 +1486,15 @@ static double search_cu( NULL, lcu, tree_type, recon_luma, recon_chroma); + if (!state->encoder_control->cfg.cclm && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu + ); + } if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) diff --git a/src/search_intra.c b/src/search_intra.c index 8067772a..6a488952 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -264,6 +264,7 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, } + /** * \brief Perform search for best intra transform split configuration. * @@ -353,14 +354,14 @@ static double search_intra_trdepth( } int start_idx = 0; - int end_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && + int end_lfnst_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
max_lfnst_idx : 0; - for (int i = start_idx; i < end_idx + 1; ++i) { + for (int i = start_idx; i < end_lfnst_idx + 1; ++i) { search_data->lfnst_costs[i] = MAX_DOUBLE; } - for (int lfnst_idx = start_idx; lfnst_idx <= end_idx; lfnst_idx++) { + for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { // Initialize lfnst variables pred_cu->lfnst_idx = lfnst_idx; pred_cu->violates_lfnst_constrained_luma = false; @@ -391,21 +392,32 @@ static double search_intra_trdepth( continue; } - if (!has_been_split) { + if (!has_been_split && (lfnst_idx != 0 || trafo != 0)) { memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); state->search_cabac.update = 1; } - - uvg_intra_recon_cu( - state, - search_data, - cu_loc, - pred_cu, - lcu, - UVG_LUMA_T, - true, - false + double rd_cost; + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + rd_cost = uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + cost_treshold, + search_data, + lcu ); + } + else { + uvg_intra_recon_cu( + state, + search_data, + cu_loc, + pred_cu, + lcu, + UVG_LUMA_T, + true, + false + ); + } if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; if (trafo != 0 && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; @@ -417,13 +429,8 @@ static double search_intra_trdepth( continue; } } - - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH, - lcu_px.x, - lcu_px.y); - - if (trafo != MTS_SKIP && end_idx != 0) { + + if (trafo != MTS_SKIP && end_lfnst_idx != 0) { uvg_derive_lfnst_constraints( pred_cu, constraints, @@ -434,22 +441,25 @@ static double search_intra_trdepth( COLOR_Y); } - if (!constraints[1] && cbf_is_set(pred_cu->cbf, COLOR_Y)) { + if (!constraints[1] && (cbf_is_set(pred_cu->cbf, COLOR_Y) || pred_cu->intra.isp_mode != ISP_MODE_NO_ISP)) { //end_idx = 0; if (pred_cu->lfnst_idx > 0) { continue; } } - - double rd_cost = uvg_cu_rd_cost_luma( - state, - cu_loc, - pred_cu, - lcu, - search_data->best_isp_cbfs); + + + if (pred_cu->intra.isp_mode == 
ISP_MODE_NO_ISP) { + rd_cost = uvg_cu_rd_cost_luma( + state, + cu_loc, + pred_cu, + lcu, + search_data->best_isp_cbfs); + } double transform_bits = 0; if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && - trafo != MTS_SKIP) { + trafo != MTS_SKIP && end_lfnst_idx != 0) { if (!constraints[0] && constraints[1]) { transform_bits += CTX_ENTROPY_FBITS( &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], @@ -462,9 +472,9 @@ static double search_intra_trdepth( } } if (num_transforms > 2 && trafo != MTS_SKIP && width <= 32 - /*&& height <= 32*/ + && height <= 32 && !pred_cu->violates_mts_coeff_constraint && pred_cu-> - mts_last_scan_pos && lfnst_idx == 0) { + mts_last_scan_pos) { bool symbol = trafo != 0; int ctx_idx = 0; @@ -1320,12 +1330,12 @@ static int8_t search_intra_rdo( can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL double best_isp_cost = MAX_DOUBLE; double best_bits = MAX_DOUBLE; - int8_t best_isp_mode = -1; + int8_t best_isp_mode = 0; int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height) && state->encoder_control->cfg.isp ? 
NUM_ISP_MODES : 1; // - int best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; - int best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; + uint8_t best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; + uint8_t best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { @@ -1353,6 +1363,7 @@ static int8_t search_intra_rdo( search_data[mode].bits = best_bits; search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; search_data[mode].pred_cu.tr_idx = best_mts_mode_for_isp[best_isp_mode]; + search_data[mode].pred_cu.tr_skip = best_mts_mode_for_isp[best_isp_mode] == MTS_SKIP; search_data[mode].pred_cu.lfnst_idx = best_lfnst_mode_for_isp[best_isp_mode]; } From 7d787c6b22acec30c7600ad27cb41c59eb8b218f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 3 Feb 2023 14:58:36 +0200 Subject: [PATCH 192/254] [ISP] Fix ISP cost calculation and DepQuant with mts --- src/dep_quant.c | 13 +++++++++---- src/intra.c | 4 ++-- src/search_intra.c | 23 +++++++++-------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 80a5c179..103e02d4 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -240,6 +240,7 @@ int uvg_init_nb_info(encoder_control_t * encoder) { { nbSbb->inPos[k] = 0; } + printf(""); } { //===== outside subband neighbours ===== @@ -1282,8 +1283,8 @@ int uvg_dep_quant( encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), q_coeff[blkpos], - effectWidth, - effectHeight + width, + height ); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { @@ -1304,9 +1305,13 @@ int uvg_dep_quant( encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? 
scanIdx - 1 : 0], (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), default_quant_coeff, - effectWidth, - effectHeight); //tu.cu->slice->getReverseLastSigCoeffFlag()); + width, + height); //tu.cu->slice->getReverseLastSigCoeffFlag()); } + Decision temp[8]; + Decision* decisions = ctxs->m_trellis[scanIdx]; + memcpy(temp, decisions, sizeof(Decision) * 8); + decisions++; } //===== find best path ===== diff --git a/src/intra.c b/src/intra.c index 026254e1..e3b61540 100644 --- a/src/intra.c +++ b/src/intra.c @@ -2016,7 +2016,7 @@ bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp return false; } if (isp_split_type == ISP_MODE_NO_ISP) { - return false; + return true; } const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER, true); @@ -2062,7 +2062,7 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, &tu_loc, &search_data->pred_cu, lcu, false, UVG_LUMA_T); - int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; + int index = tu_loc.local_y * LCU_WIDTH + tu_loc.local_x; int ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, tu_loc.width, tu_loc.height); diff --git a/src/search_intra.c b/src/search_intra.c index 6a488952..0aee661c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -361,21 +361,16 @@ static double search_intra_trdepth( } - for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { - // Initialize lfnst variables - pred_cu->lfnst_idx = lfnst_idx; - pred_cu->violates_lfnst_constrained_luma = false; - pred_cu->violates_lfnst_constrained_chroma = false; - pred_cu->lfnst_last_scan_pos = false; - - //if (pred_cu->lfnst_idx != 0) { - // // Cannot use ISP with LFNST for small blocks - // pred_cu->intra.isp_mode = uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
pred_cu->intra.isp_mode : ISP_MODE_NO_ISP; - //} - - for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { + // Initialize lfnst variables pred_cu->tr_idx = trafo; pred_cu->tr_skip = trafo == MTS_SKIP; + pred_cu->lfnst_idx = lfnst_idx; + pred_cu->violates_lfnst_constrained_luma = false; + pred_cu->violates_lfnst_constrained_chroma = false; + pred_cu->lfnst_last_scan_pos = false; + bool constraints[2] = {false, false}; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; @@ -1337,7 +1332,7 @@ static int8_t search_intra_rdo( uint8_t best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; uint8_t best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { - + search_data[mode].pred_cu.intra.isp_mode = isp_mode; double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, cu_loc, lcu); From d69bdf79f452b5ad31191088232434646156b9f4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 8 Feb 2023 14:39:36 +0200 Subject: [PATCH 193/254] [mtt] Fix couple of issues with 64x32 CUs and non square tr skip rdoq --- src/encode_coding_tree.c | 4 ++-- src/global.h | 2 +- src/intra.c | 2 +- src/rdo.c | 3 +++ src/search_intra.c | 2 ++ 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 31bbe003..5604aa16 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -237,7 +237,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; // TODO: log2_cg_size is wrong if width != height - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1]; + const uint32_t log2_cg_size = 
uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); @@ -265,7 +265,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state, bool no_sig_group_before_last = true; for (i = 0; i <= scan_cg_last; i++) { - if (!(width == 4 || (i ==scan_cg_last && no_sig_group_before_last))) { + if (!((width == 4 && height == 4) || (i ==scan_cg_last && no_sig_group_before_last))) { uint32_t cg_blkpos = scan_cg[i]; uint32_t cg_pos_y = cg_blkpos / cg_width; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * cg_width); diff --git a/src/global.h b/src/global.h index 972b7e82..27058463 100644 --- a/src/global.h +++ b/src/global.h @@ -129,7 +129,7 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -//#define UVG_DEBUG_PRINT_CABAC 1 +#define UVG_DEBUG_PRINT_CABAC 1 //#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 diff --git a/src/intra.c b/src/intra.c index e3b61540..2dae1a6c 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1649,7 +1649,7 @@ void uvg_intra_predict( } else { uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, height, stride / 2, width); - if (!PU_IS_TU(&data->pred_cu) || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { + if (width != 1 << data->pred_cu.log2_chroma_width || height != 1 << data->pred_cu.log2_chroma_height || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { predict_cclm( state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 
0 : 1], diff --git a/src/rdo.c b/src/rdo.c index 18f65f12..cfc03c48 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1196,8 +1196,11 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); FILL_ARRAY(cost_coeffgroup_sig, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); FILL_ARRAY(cost_coeffgroup_sig, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); FILL_ARRAY(cost_coeffgroup_sig, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); FILL_ARRAY(cost_coeffgroup_sig, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); FILL_ARRAY(cost_coeffgroup_sig, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); FILL_ARRAY(cost_coeffgroup_sig, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); FILL_ARRAY(cost_coeffgroup_sig, 0, 64); break; default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); } diff --git a/src/search_intra.c b/src/search_intra.c index 0aee661c..1c911996 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -341,6 +341,7 @@ static double search_intra_trdepth( if (state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && PU_IS_TU(pred_cu) && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { num_transforms = MAX(num_transforms, 2); } @@ -378,6 +379,7 @@ static double search_intra_trdepth( if (trafo == MTS_SKIP && ((width > (1 << state->encoder_control->cfg.trskip_max_size) || (height > (1 << state->encoder_control->cfg.trskip_max_size))) + || !PU_IS_TU(pred_cu) || !state->encoder_control->cfg.trskip_enable)) { continue; } From d222718c22984b0b3a70361920ba986a014a0204 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 14 Feb 2023 10:36:01 +0200 Subject: [PATCH 194/254] [mtt] Minor fixes --- src/cli.c | 10 +++++----- src/cu.c | 28 +++++++++++----------------- src/cu.h | 
3 ++- src/encode_coding_tree.c | 2 +- src/global.h | 2 +- src/search.c | 4 +++- src/search_intra.c | 2 +- 7 files changed, 24 insertions(+), 27 deletions(-) diff --git a/src/cli.c b/src/cli.c index ce238506..b7c56efb 100644 --- a/src/cli.c +++ b/src/cli.c @@ -192,11 +192,11 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, - {"mtt-depth-intra", required_argument, NULL, 0 }, - {"mtt-depth-inter", required_argument, NULL, 0 }, - {"mtt-depth-intra-chroma", required_argument, NULL, 0 }, - {"max_bt_size", required_argument, NULL, 0 }, - {"max_tt_size", required_argument, NULL, 0 }, + { "mtt-depth-intra", required_argument, NULL, 0 }, + { "mtt-depth-inter", required_argument, NULL, 0 }, + { "mtt-depth-intra-chroma", required_argument, NULL, 0 }, + { "max-bt-size", required_argument, NULL, 0 }, + { "max-tt-size", required_argument, NULL, 0 }, { "intra-rough-granularity",required_argument, NULL, 0 }, { "ibc", required_argument, NULL, 0 }, { "dep-quant", no_argument, NULL, 0 }, diff --git a/src/cu.c b/src/cu.c index 147875fb..301ca100 100644 --- a/src/cu.c +++ b/src/cu.c @@ -373,18 +373,11 @@ int uvg_get_split_locs( int uvg_get_implicit_split( const encoder_state_t* const state, const cu_loc_t* const cu_loc, - enum - uvg_tree_type tree_type, - uint8_t max_mtt_depth) + uint8_t max_mtt_depth, + bool uses_chroma_coordinates) { - // This checking if cabac is in update state is a very dirty way of checking - // whether we are in the search or writing the bitstream, and unfortunately the - // coordinates are different for chroma tree in those two conditions. It might be - // possible to pass the chroma loc for uvg_get_possible_splits in the search but - // then all of the conditions need to be checked in that function. 
- // This current solutions *might* not work with alf enabled but I think it should work - bool right_ok = (state->tile->frame->width >> (tree_type == UVG_CHROMA_T && state->cabac.update)) >= cu_loc->x + cu_loc->width; - bool bottom_ok = (state->tile->frame->height >> (tree_type == UVG_CHROMA_T && state->cabac.update)) >= cu_loc->y + cu_loc->height; + bool right_ok = (state->tile->frame->width >> uses_chroma_coordinates) >= cu_loc->x + cu_loc->width; + bool bottom_ok = (state->tile->frame->height >> uses_chroma_coordinates) >= cu_loc->y + cu_loc->height; if (right_ok && bottom_ok) return NO_SPLIT; if (right_ok && max_mtt_depth != 0) return BT_HOR_SPLIT; @@ -394,10 +387,11 @@ int uvg_get_implicit_split( int uvg_get_possible_splits(const encoder_state_t * const state, - const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]) + const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6], bool + use_chroma_coordinates) { - const int width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const int height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const unsigned height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const unsigned max_btd = @@ -408,7 +402,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, const unsigned min_tt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; - const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, tree_type, max_btd); + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd, use_chroma_coordinates); splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; @@ -426,8 +420,8 @@ int uvg_get_possible_splits(const encoder_state_t * const state, { splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false; - splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT; - splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT; + splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size; + splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size; if (tree_type == UVG_CHROMA_T && width == 4) splits[BT_VER_SPLIT] = false; if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true; return 1; diff --git a/src/cu.h b/src/cu.h index 843fe582..36cfb239 100644 --- a/src/cu.h +++ b/src/cu.h @@ -203,7 +203,8 @@ int uvg_get_split_locs( cu_loc_t out[4], uint8_t* separate_chroma); int uvg_get_possible_splits(const encoder_state_t* const state, - const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]); + const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6], bool + use_chroma_coordinates); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 5604aa16..a23de174 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1251,7 +1251,7 
@@ uint8_t uvg_write_split_flag( bool can_split[6]; - const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split, tree_type == UVG_CHROMA_T); bool allow_split = can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; diff --git a/src/global.h b/src/global.h index 27058463..972b7e82 100644 --- a/src/global.h +++ b/src/global.h @@ -129,7 +129,7 @@ typedef int16_t coeff_t; typedef int32_t mv_t; //#define VERBOSE 1 -#define UVG_DEBUG_PRINT_CABAC 1 +//#define UVG_DEBUG_PRINT_CABAC 1 //#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 diff --git a/src/search.c b/src/search.c index 40cc012a..f5f6e044 100644 --- a/src/search.c +++ b/src/search.c @@ -1600,6 +1600,8 @@ static double search_cu( cu_loc_t separate_tree_chroma_loc = *cu_loc; separate_tree_chroma_loc.y >>= 1; separate_tree_chroma_loc.x >>= 1; + separate_tree_chroma_loc.width >>= 1; + separate_tree_chroma_loc.height >>= 1; if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; @@ -1689,7 +1691,7 @@ static double search_cu( } bool can_split[6]; - bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split, false); const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const int max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; diff --git a/src/search_intra.c b/src/search_intra.c index 1c911996..689d872d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -417,7 +417,7 @@ static double search_intra_trdepth( } if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; - if (trafo != 0 && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; + if ((trafo != 0 || lfnst_idx != 0) && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; derive_mts_constraints(pred_cu, lcu, width, height, lcu_px); if (pred_cu->tr_idx > 1) { From 0f50caa2d0c558d07175d6f19dd34ab85920a294 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 15 Feb 2023 14:23:55 +0200 Subject: [PATCH 195/254] [mtt] Fix various small issues and DepQuant for non-square blocks --- src/dep_quant.c | 11 ++++------- src/intra.c | 2 +- src/search.c | 2 +- src/search_intra.c | 9 ++++++--- src/transform.c | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 103e02d4..a41bf6c5 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1153,7 +1153,7 @@ int uvg_dep_quant( int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - bool needs_block_size_trafo_scale = is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); + bool needs_block_size_trafo_scale = !is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); needs_block_size_trafo_scale |= 0; // Non log2 block size const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; @@ -1252,8 +1252,8 @@ int uvg_dep_quant( uint32_t pos_y_next = blkpos_next >> log2_tr_width; uint32_t pos_x_next = blkpos_next - (pos_y_next << log2_tr_width); uint32_t cg_blockpos_next = scanIdx ? 
cg_scan[(scanIdx -1) >> 4] : 0; - uint32_t cg_pos_y_next = cg_blockpos_next / height_in_sbb; - uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * height_in_sbb); + uint32_t cg_pos_y_next = cg_blockpos_next / width_in_sbb; + uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * width_in_sbb); uint32_t diag = pos_y_next + pos_x_next; uint32_t sig_ctx_offset = compID == COLOR_Y ? (diag < 2 ? 8 : diag < 5 ? 4 : 0) : (diag < 2 ? 4 : 0); @@ -1308,10 +1308,7 @@ int uvg_dep_quant( width, height); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - Decision temp[8]; - Decision* decisions = ctxs->m_trellis[scanIdx]; - memcpy(temp, decisions, sizeof(Decision) * 8); - decisions++; + } //===== find best path ===== diff --git a/src/intra.c b/src/intra.c index 2dae1a6c..3d2c1f81 100644 --- a/src/intra.c +++ b/src/intra.c @@ -2070,7 +2070,7 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, int cbf = cbf_is_set(search_data->pred_cu.cbf, COLOR_Y); - if (i + 1 != split_limit && search_data->best_isp_cbfs != 0) { + if (i + 1 != split_limit || search_data->best_isp_cbfs != 1 << (split_limit - 1)) { CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_luma[cbf_context], cbf, coeff_bits, "cbf_luma_isp_recon"); } cost += ssd + coeff_bits * state->lambda; diff --git a/src/search.c b/src/search.c index f5f6e044..8d7e343d 100644 --- a/src/search.c +++ b/src/search.c @@ -1798,7 +1798,7 @@ static double search_cu( // 3.9 const double factor = state->qp > 30 ? 
1.1 : 1.075; - if (split_bits * state->frame->lambda + cost / factor > cost) { + if (split_bits * state->lambda + cost / factor > cost) { can_split[split_type] = false; continue; } diff --git a/src/search_intra.c b/src/search_intra.c index 689d872d..83ec950d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -457,7 +457,7 @@ static double search_intra_trdepth( double transform_bits = 0; if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && trafo != MTS_SKIP && end_lfnst_idx != 0) { - if (!constraints[0] && constraints[1]) { + if ((!constraints[0] && constraints[1]) || lfnst_idx != 0) { transform_bits += CTX_ENTROPY_FBITS( &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], lfnst_idx != 0); @@ -468,7 +468,10 @@ static double search_intra_trdepth( } } } - if (num_transforms > 2 && trafo != MTS_SKIP && width <= 32 + if (num_transforms > 2 && trafo != MTS_SKIP + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP + && lfnst_idx == 0 + && width <= 32 && height <= 32 && !pred_cu->violates_mts_coeff_constraint && pred_cu-> mts_last_scan_pos) { @@ -488,7 +491,7 @@ static double search_intra_trdepth( } } - rd_cost += transform_bits * state->frame->lambda; + rd_cost += transform_bits * state->lambda; search_data->lfnst_costs[lfnst_idx] = MIN( search_data->lfnst_costs[lfnst_idx], diff --git a/src/transform.c b/src/transform.c index 2ee2fc32..783d9f2b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -782,8 +782,8 @@ void uvg_chroma_transform_search( pred_cu->violates_lfnst_constrained_chroma = false; } if (!is_jccr) { - double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda; - double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->frame->lambda; + double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->c_lambda; + double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->c_lambda; if (u_cost < chorma_ts_out->best_u_cost) { chorma_ts_out->best_u_cost = u_cost; chorma_ts_out->best_u_index = u_has_coeffs ? 
transforms[i] : NO_RESIDUAL; @@ -794,7 +794,7 @@ void uvg_chroma_transform_search( } } else { - double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->frame->lambda; + double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->c_lambda; if (cost < chorma_ts_out->best_combined_cost) { chorma_ts_out->best_combined_cost = cost; chorma_ts_out->best_combined_index = transforms[i]; From 146e1cb85e9790abbddf8ad93656af859b6839bf Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 22 Feb 2023 14:48:00 +0200 Subject: [PATCH 196/254] [dual-tree] WIP simplification --- src/cu.c | 40 +++++++++--------- src/cu.h | 6 +-- src/encode_coding_tree.c | 55 +++++++++--------------- src/encoderstate.c | 27 ++++++------ src/filter.c | 6 +-- src/intra.c | 48 ++++++++------------- src/intra.h | 5 +-- src/search.c | 90 ++++++++++++++++++++-------------------- src/search_intra.c | 20 ++++----- 9 files changed, 129 insertions(+), 168 deletions(-) diff --git a/src/cu.c b/src/cu.c index 301ca100..56408b33 100644 --- a/src/cu.c +++ b/src/cu.c @@ -276,10 +276,10 @@ cu_array_t * uvg_cu_array_copy_ref(cu_array_t* cua) * \param dst_y y-coordinate of the top edge of the copied area in dst * \param src source lcu */ -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type) +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src) { const int dst_stride = dst->stride >> 2; - const int width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH : LCU_WIDTH_C; + const int width = LCU_WIDTH; for (int y = 0; y < width; y += SCU_WIDTH) { for (int x = 0; x < width; x += SCU_WIDTH) { const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y); @@ -373,11 +373,10 @@ int uvg_get_split_locs( int uvg_get_implicit_split( const encoder_state_t* const state, const cu_loc_t* const cu_loc, - uint8_t max_mtt_depth, - bool uses_chroma_coordinates) + uint8_t max_mtt_depth) { - bool right_ok = (state->tile->frame->width >> uses_chroma_coordinates) >= cu_loc->x + cu_loc->width; - bool bottom_ok = (state->tile->frame->height >> uses_chroma_coordinates) >= cu_loc->y + cu_loc->height; + bool right_ok = (state->tile->frame->width) >= cu_loc->x + cu_loc->width; + bool bottom_ok = (state->tile->frame->height) >= cu_loc->y + cu_loc->height; if (right_ok && bottom_ok) return NO_SPLIT; if (right_ok && max_mtt_depth != 0) return BT_HOR_SPLIT; @@ -387,22 +386,21 @@ int uvg_get_implicit_split( int uvg_get_possible_splits(const encoder_state_t * const state, - const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6], bool - use_chroma_coordinates) + const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]) { - const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const unsigned height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const unsigned width = cu_loc->width; + const unsigned height = cu_loc->height; const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const unsigned max_btd = state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth; - const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type] >> (tree_type == UVG_CHROMA_T); - const unsigned min_bt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); - const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type] >> (tree_type == UVG_CHROMA_T); - const unsigned min_tt_size = 1 << MIN_SIZE >> (tree_type == UVG_CHROMA_T); + const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type]; + const unsigned min_bt_size = 1 << MIN_SIZE; + const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type]; + const unsigned min_tt_size = 1 << MIN_SIZE; const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; - const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd, use_chroma_coordinates); + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd); splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; bool can_btt = split_tree.mtt_depth < max_btd; @@ -414,7 +412,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false; if (width <= min_qt_size) splits[QT_SPLIT] = false; - if (tree_type == UVG_CHROMA_T && width <= 4) splits[QT_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width <= 8) splits[QT_SPLIT] = false; if (implicitSplit != NO_SPLIT) { @@ -422,7 +420,7 @@ int uvg_get_possible_splits(const encoder_state_t * const state, splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size; splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size; - if (tree_type == UVG_CHROMA_T && width == 4) splits[BT_VER_SPLIT] = false; + if 
(tree_type == UVG_CHROMA_T && width <= 8) splits[BT_VER_SPLIT] = false; if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true; return 1; } @@ -459,23 +457,23 @@ int uvg_get_possible_splits(const encoder_state_t * const state, // specific check for BT splits if (height <= min_bt_size) splits[BT_HOR_SPLIT] = false; if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false; - if (tree_type == UVG_CHROMA_T && width * height <= 16) splits[BT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64) splits[BT_HOR_SPLIT] = false; if (width <= min_bt_size) splits[BT_VER_SPLIT] = false; if (width <= 64 && height > 64) splits[BT_VER_SPLIT] = false; - if (tree_type == UVG_CHROMA_T && (width * height <= 16 || width == 4)) splits[BT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 64 || width <= 8)) splits[BT_VER_SPLIT] = false; //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) splits[TT_HOR_SPLIT] = false; if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; - if (tree_type == UVG_CHROMA_T && width * height <= 16 * 2) splits[TT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64 * 2) splits[TT_HOR_SPLIT] = false; if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) splits[TT_VER_SPLIT] = false; if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; - if (tree_type == UVG_CHROMA_T && (width * height <= 16 * 2 || width == 8)) splits[TT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 64 * 2 || width <= 16)) splits[TT_VER_SPLIT] = false; //if (modeType == MODE_TYPE_INTER && width * height == 64) splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false; return 0; diff --git a/src/cu.h b/src/cu.h index 36cfb239..87265ee9 100644 --- a/src/cu.h +++ b/src/cu.h @@ -203,8 +203,7 
@@ int uvg_get_split_locs( cu_loc_t out[4], uint8_t* separate_chroma); int uvg_get_possible_splits(const encoder_state_t* const state, - const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6], bool - use_chroma_coordinates); + const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]); #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ @@ -383,8 +382,7 @@ typedef struct { cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1]; } lcu_t; -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type - tree_type); +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src); int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height, const lcu_t* const lcu, bool left); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a23de174..1d121d18 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -475,14 +475,12 @@ static void encode_chroma_tu( cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, - uint8_t joint_chroma, - enum - uvg_tree_type tree_type) + uint8_t joint_chroma) { int width_c = cu_loc->chroma_width; int height_c = cu_loc->chroma_height; - int x_local = (cu_loc->x >> (tree_type != UVG_CHROMA_T)) % LCU_WIDTH_C; - int y_local = (cu_loc->y >> (tree_type != UVG_CHROMA_T)) % LCU_WIDTH_C; + int x_local = (cu_loc->x >> 1) % LCU_WIDTH_C; + int y_local = (cu_loc->y >> 1) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; *scan_idx = SCAN_DIAG; if(!joint_chroma){ @@ -615,7 +613,7 @@ static void encode_transform_unit( if ((chroma_cbf_set || joint_chroma) && last_split && chroma_loc) { //Need to drop const to get lfnst constraints // Use original dimensions instead of ISP split dimensions - encode_chroma_tu(state, chroma_loc, (cu_info_t*)cur_pu, 
&scan_idx, coeff, joint_chroma, tree_type); + encode_chroma_tu(state, chroma_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma); } } @@ -657,7 +655,7 @@ static void encode_transform_coeff( cur_tu = uvg_cu_array_at_const(used_array, x, y); } - const int tr_limit = (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)); + const int tr_limit = TR_MAX_WIDTH; const bool ver_split = cu_loc->height > tr_limit; const bool hor_split = cu_loc->width > tr_limit; @@ -681,10 +679,6 @@ static void encode_transform_coeff( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - if(tree_type == UVG_CHROMA_T) { - split_cu_loc[i].chroma_width = split_cu_loc[i].width; - split_cu_loc[i].chroma_height = split_cu_loc[i].height; - } encode_transform_coeff(state, &split_cu_loc[i], only_chroma, coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc ? &split_cu_loc[i] : NULL); } @@ -1246,12 +1240,12 @@ uint8_t uvg_write_split_flag( // Implisit split flag when on border // Exception made in VVC with flag not being implicit if the BT can be used for // horizontal or vertical split, then this flag tells if QT or BT is used - const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; bool can_split[6]; - const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split, tree_type == UVG_CHROMA_T); + const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); bool allow_split = can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; @@ -1354,11 +1348,11 @@ void uvg_encode_coding_tree( const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? 
frame->cu_array : frame->chroma_cu_array; - const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; - const int x = tree_type != UVG_CHROMA_T ? cu_loc->x : chroma_loc->x; - const int y = tree_type != UVG_CHROMA_T ? cu_loc->y : chroma_loc->y; + const int x = cu_loc->x; + const int y = cu_loc->y; const cu_info_t* cur_cu = uvg_cu_array_at_const(used_array, x, y); @@ -1375,11 +1369,11 @@ void uvg_encode_coding_tree( // Absolute coordinates - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y ; - int32_t frame_width = tree_type != UVG_CHROMA_T ? ctrl->in.width : ctrl->in.width / 2; - int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2; + int32_t frame_width = ctrl->in.width; + int32_t frame_height = ctrl->in.height; // Stop if we are outside of the frame if (abs_x >= frame_width || abs_y >= frame_height) return; @@ -1412,25 +1406,14 @@ void uvg_encode_coding_tree( 0}; cu_loc_t new_cu_loc[4]; - cu_loc_t chroma_tree_loc; uint8_t separate_chroma = 0; const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); separate_chroma |= !has_chroma; for (int split = 0; split >= 1; - chroma_tree_loc.y >>= 1; - chroma_tree_loc.local_x = chroma_tree_loc.x & LCU_WIDTH_C; - chroma_tree_loc.local_y = chroma_tree_loc.y & LCU_WIDTH_C; - chroma_tree_loc.width >>= 1; - chroma_tree_loc.height >>= 1; - assert(!separate_chroma); - } uvg_encode_coding_tree(state, coeff, tree_type, &new_cu_loc[split], - separate_chroma ? chroma_loc :(tree_type == UVG_CHROMA_T ? &chroma_tree_loc : &new_cu_loc[split]), + separate_chroma ? 
chroma_loc : &new_cu_loc[split], new_split_tree, !separate_chroma || (split == splits - 1 && has_chroma)); } return; @@ -1714,8 +1697,8 @@ double uvg_mock_encode_coding_unit( const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); - int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); + int x_local = cu_loc->local_x; + int y_local = cu_loc->local_y; const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; const cu_info_t* left_cu = NULL, *above_cu = NULL; diff --git a/src/encoderstate.c b/src/encoderstate.c index 32ecfeac..e8a43548 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -890,12 +890,6 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); cu_loc_t chroma_tree_loc = start; - chroma_tree_loc.x >>= 1; - chroma_tree_loc.y >>= 1; - chroma_tree_loc.local_x = chroma_tree_loc.x & LCU_WIDTH_C; - chroma_tree_loc.local_y = chroma_tree_loc.y & LCU_WIDTH_C; - chroma_tree_loc.width >>= 1; - chroma_tree_loc.height >>= 1; uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true); } @@ -1175,6 +1169,12 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) uvg_threadqueue_submit(state->encoder_control->threadqueue, job[0]); uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]); +#ifdef UVG_DEBUG_PRINT_CABAC + // Ensures that the ctus are encoded in raster scan order + if(i >= state->tile->frame->width_in_lcu) { + uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[(lcu->id / state->tile->frame->width_in_lcu - 1) * state->tile->frame->width_in_lcu]); + } +#endif } 
uvg_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); @@ -1307,10 +1307,10 @@ static void encoder_state_encode(encoder_state_t * const main_state) { if(main_state->encoder_control->cfg.dual_tree){ sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray( main_state->tile->frame->chroma_cu_array, - offset_x / 2, - offset_y / 2, - sub_state->tile->frame->width_in_lcu * LCU_WIDTH_C, - sub_state->tile->frame->height_in_lcu * LCU_WIDTH_C + offset_x, + offset_y, + sub_state->tile->frame->width_in_lcu * LCU_WIDTH, + sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); } } @@ -1949,10 +1949,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict if (cfg->dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 && state->frame->is_irap) { assert(state->tile->frame->chroma_cu_array == NULL); - state->tile->frame->chroma_cu_array = uvg_cu_array_chroma_alloc( - state->tile->frame->width / 2, - state->tile->frame->height / 2, - state->encoder_control->chroma_format + state->tile->frame->chroma_cu_array = uvg_cu_array_alloc( + state->tile->frame->width, + state->tile->frame->height ); } // Set pictype. diff --git a/src/filter.c b/src/filter.c index cabc75e3..1fff4b55 100644 --- a/src/filter.c +++ b/src/filter.c @@ -273,8 +273,6 @@ static bool is_tu_boundary( color_t color, enum uvg_tree_type tree_type) { - x >>= tree_type == UVG_CHROMA_T; - y >>= tree_type == UVG_CHROMA_T; // if (x & 3 || y & 3) return false; const cu_info_t *const scu = uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? 
state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y); @@ -1081,8 +1079,8 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, // CUs on both sides of the edge cu_info_t *cu_p; cu_info_t *cu_q; - int32_t x_coord = x << (tree_type != UVG_CHROMA_T); - int32_t y_coord = y << (tree_type != UVG_CHROMA_T); + int32_t x_coord = x << 1; + int32_t y_coord = y << 1; cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; if (dir == EDGE_VER) { y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); diff --git a/src/intra.c b/src/intra.c index 3d2c1f81..cad654df 100644 --- a/src/intra.c +++ b/src/intra.c @@ -532,9 +532,8 @@ static void predict_cclm( const lcu_t* const lcu, uvg_intra_references* chroma_ref, uvg_pixel* dst, - cclm_parameters_t* cclm_params, - enum uvg_tree_type tree_type - ) + cclm_parameters_t* cclm_params +) { assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX); assert(state->encoder_control->cfg.cclm); @@ -552,17 +551,14 @@ static void predict_cclm( const uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - - tree_type = state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? tree_type : UVG_BOTH_T; - - const int ctu_size = tree_type == UVG_CHROMA_T ? 
LCU_WIDTH_C : LCU_WIDTH; + + const int ctu_size = LCU_WIDTH; if (y0) { if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 4), (state->tile->frame->width - x0 - width* 2) / 4); for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; - x_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu >> (tree_type==UVG_CHROMA_T)) - 4); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu) - 4); if (x_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; } if(y_scu == 0) { @@ -588,8 +584,7 @@ static void predict_cclm( if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 4), (state->tile->frame->height - y0 - height * 2) / 4); for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; - y_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu >> (tree_type == UVG_CHROMA_T)) - 4, y_extension); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu) - 4, y_extension); if (y_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; if(x_scu == 32 && y_scu == 0 && pu->log2_height == 6 && pu->log2_width == 6 ) break; } @@ -1617,9 +1612,8 @@ void uvg_intra_predict( const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ) + const lcu_t* lcu +) { const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // TODO: what is this used for? @@ -1652,8 +1646,7 @@ void uvg_intra_predict( if (width != 1 << data->pred_cu.log2_chroma_width || height != 1 << data->pred_cu.log2_chroma_height || data->cclm_parameters[color == COLOR_U ? 
0 : 1].b <= 0) { predict_cclm( state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, - (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1], - tree_type); + (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]); } else { linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, height); @@ -1781,8 +1774,7 @@ static void intra_recon_tb_leaf( const cu_loc_t* cu_loc, lcu_t *lcu, color_t color, - const intra_search_data_t* search_data, - enum uvg_tree_type tree_type) + const intra_search_data_t* search_data) { const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; @@ -1829,7 +1821,7 @@ static void intra_recon_tb_leaf( uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; - uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; @@ -1883,12 +1875,8 @@ void uvg_intra_recon_cu( { const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; const vector2d_t lcu_px = { - cu_loc->local_x >> - (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && - state->frame->slicetype == UVG_SLICE_I), - cu_loc->local_y >> - (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && - state->frame->slicetype == UVG_SLICE_I), + cu_loc->local_x, + cu_loc->local_y, }; const int8_t width = cu_loc->width; const int8_t height = cu_loc->height; @@ -1945,7 +1933,7 @@ void uvg_intra_recon_cu( uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); cur_cu->intra.isp_index = 0; if(tu_loc.x % 4 == 0) { - intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, 
&pu_loc, cu_loc, lcu, COLOR_Y, search_data); } uvg_quantize_lcu_residual(state, true, false, false, &tu_loc, cur_cu, lcu, @@ -1959,11 +1947,11 @@ void uvg_intra_recon_cu( // Process a leaf TU. if (has_luma) { - intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_Y, search_data, tree_type); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_Y, search_data); } if (has_chroma) { - intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_V, search_data, tree_type); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_U, search_data); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_V, search_data); } // TODO: not necessary to call if only luma and ISP is on @@ -2056,7 +2044,7 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); search_data->pred_cu.intra.isp_index = 0; if (tu_loc.x % 4 == 0) { - intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data, UVG_LUMA_T); + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); } uvg_quantize_lcu_residual(state, true, false, false, &tu_loc, &search_data->pred_cu, lcu, diff --git a/src/intra.h b/src/intra.h index 71de9a6a..676588ec 100644 --- a/src/intra.h +++ b/src/intra.h @@ -138,9 +138,8 @@ void uvg_intra_predict( const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ); + const lcu_t* lcu +); void uvg_intra_recon_cu( encoder_state_t* const state, diff --git a/src/search.c b/src/search.c index 8d7e343d..952fa1b8 100644 --- a/src/search.c +++ b/src/search.c @@ -67,10 +67,10 @@ static const int INTRA_THRESHOLD = 8; static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type tree_type) { - const int y_limit = (cu_loc->local_y + cu_loc->height) >> (tree_type == UVG_CHROMA_T); - const 
int x_limit = (cu_loc->local_x + cu_loc->width) >> (tree_type == UVG_CHROMA_T); - for (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) { - for (int x = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) { + const int y_limit = (cu_loc->local_y + cu_loc->height); + const int x_limit = (cu_loc->local_x + cu_loc->width); + for (int y = cu_loc->local_y ; y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); } } @@ -86,8 +86,8 @@ static INLINE void initialize_partial_work_tree( chroma_loc, const enum uvg_tree_type tree_type) { - const int y_limit = MIN(LCU_WIDTH, state->tile->frame->height - cu_loc->y / 64 * 64) >> (tree_type == UVG_CHROMA_T); - const int x_limit = MIN(LCU_WIDTH, state->tile->frame->width - cu_loc->x / 64 * 64) >> (tree_type == UVG_CHROMA_T); + const int y_limit = MIN(LCU_WIDTH, state->tile->frame->height - cu_loc->y / 64 * 64); + const int x_limit = MIN(LCU_WIDTH, state->tile->frame->width - cu_loc->x / 64 * 64); if (cu_loc->local_x == 0) { to->left_ref = from->left_ref; @@ -150,8 +150,8 @@ static INLINE void initialize_partial_work_tree( } - const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; - const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; + const int y_start = (cu_loc->local_y) - 4; + const int x_start = (cu_loc->local_x) - 4; for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } @@ -159,15 +159,15 @@ static INLINE void initialize_partial_work_tree( *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); } - for (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) { - for (int x = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) { + for (int y = cu_loc->local_y; y < y_limit; y += SCU_WIDTH) 
{ + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); } } if(chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x && tree_type == UVG_BOTH_T) { - const int y_start = (chroma_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; - const int x_start = (chroma_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; + const int y_start = (chroma_loc->local_y) - 4; + const int x_start = (chroma_loc->local_x) - 4; for (int y = y_start; y < y_limit; y += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); } @@ -190,24 +190,24 @@ static INLINE void initialize_partial_work_tree( to->top_ref = from->top_ref; *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); } - if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + if (x_limit != LCU_WIDTH) { for (int y = y_start; y < y_limit; y += SCU_WIDTH) { memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); } } - if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + if (y_limit != LCU_WIDTH) { for (int x = x_start; x < x_limit; x += SCU_WIDTH) { memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); } } } else { - if (x_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + if (x_limit != LCU_WIDTH) { for (int y = y_start; y < y_limit; y += SCU_WIDTH) { memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); } } - if (y_limit != LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) { + if (y_limit != LCU_WIDTH) { for (int x = x_start; x < x_limit; x += SCU_WIDTH) { memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); } @@ -222,10 +222,10 @@ static INLINE void copy_cu_pixels( enum uvg_tree_type tree_type) { - const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); - const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); + const int x_local = cu_loc->local_x; + const int y_local = cu_loc->local_y; const int luma_index = x_local + y_local * 
LCU_WIDTH; - const int chroma_index = tree_type == UVG_CHROMA_T ? x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; + const int chroma_index = (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; if(tree_type != UVG_CHROMA_T) { uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index], @@ -372,11 +372,11 @@ static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) static void lcu_fill_chroma_cbfs(lcu_t *lcu, const cu_loc_t * const chroma_loc, enum uvg_tree_type tree_type) { - int8_t height = tree_type == UVG_CHROMA_T ? chroma_loc->chroma_height : chroma_loc->height; - int8_t width = tree_type == UVG_CHROMA_T ? chroma_loc->chroma_width : chroma_loc->width; + int8_t height = chroma_loc->height; + int8_t width = chroma_loc->width; uint32_t x_local = chroma_loc->local_x; uint32_t y_local = chroma_loc->local_y; - const int offset = ~((TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) - 1); + const int offset = ~((TR_MAX_WIDTH) - 1); // Set coeff flags in every CU covered by part_mode in this depth. for (uint32_t y = 0; y < height; y += SCU_WIDTH) { for (uint32_t x = 0; x < width; x += SCU_WIDTH) { @@ -728,7 +728,7 @@ static double cu_rd_cost_tr_split_accurate( const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
- cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T)); + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; double tr_tree_bits = 0; @@ -1132,28 +1132,28 @@ static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const else { if (chroma_loc->x) { - for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += TR_MAX_WIDTH / 2) { - for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += SCU_WIDTH) { + for (int x = x_local; x < x_local + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; } } } else if(chroma_loc->width == 64) { - for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH / 2, y)->chroma_deblocking |= EDGE_VER; + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; } } if(chroma_loc->y) { - for (int y = y_local; y < y_local + chroma_loc->chroma_height; y += TR_MAX_WIDTH / 2) { - for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += SCU_WIDTH) { + for (int y = y_local; y < y_local + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; } } } else if (chroma_loc->height == 64) { - for (int x = x_local; x < x_local + chroma_loc->chroma_width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH / 2)->chroma_deblocking |= EDGE_HOR; + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; } } } @@ -1218,8 +1218,8 @@ static double search_cu( const int depth = 
split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; - const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height; + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; const int x = cu_loc->x; const int y = cu_loc->y; const int luma_width = cu_loc->width; @@ -1251,8 +1251,8 @@ static double search_cu( int32_t max; } pu_depth_inter, pu_depth_intra; - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); int32_t frame_width = frame->width; int32_t frame_height = frame->height; @@ -1611,7 +1611,7 @@ static double search_cu( bits += uvg_mock_encode_coding_unit( state, cabac, - tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, + cu_loc, is_separate_tree && !has_chroma ? NULL : chroma_loc, lcu, cur_cu, @@ -1691,7 +1691,7 @@ static double search_cu( } bool can_split[6]; - bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split, false); + bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
2 : 0) : 1; const int max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; @@ -1736,7 +1736,8 @@ static double search_cu( for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { if (!can_split[split_type] || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) - || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) + || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4) + ) continue; if (completely_inside && check_for_early_termission( @@ -1788,7 +1789,7 @@ static double search_cu( &state->search_cabac, left_cu, above_cu, - tree_type != UVG_CHROMA_T ? cu_loc : &separate_tree_chroma_loc, + cu_loc, count_tree, tree_type, &is_implicit, @@ -1834,8 +1835,8 @@ static double search_cu( if (split_type == QT_SPLIT && completely_inside) { const cu_info_t * const t = LCU_GET_CU_AT_PX( &split_lcu[0], - new_cu_loc[split].local_x >> (tree_type == UVG_CHROMA_T), - new_cu_loc[split].local_y >> (tree_type == UVG_CHROMA_T)); + new_cu_loc[split].local_x, + new_cu_loc[split].local_y); stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; } @@ -2113,10 +2114,9 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i // Copy non-reference CUs to picture. uvg_cu_array_copy_from_lcu( tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, - tree_type != UVG_CHROMA_T ? x_px : x_px / 2, - tree_type != UVG_CHROMA_T ? y_px : y_px / 2, - lcu, - tree_type); + x_px, + y_px, + lcu); // Copy pixels to picture. 
{ diff --git a/src/search_intra.c b/src/search_intra.c index 83ec950d..9416f122 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -678,7 +678,7 @@ static int search_intra_chroma_rough( for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, &refs_u, cu_loc, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_u, cu_loc, &loc, COLOR_U, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -697,7 +697,7 @@ static int search_intra_chroma_rough( for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, &refs_v, cu_loc, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_v, cu_loc, &loc, COLOR_V, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -1044,9 +1044,9 @@ static uint8_t search_intra_rough( int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; - uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL); search_proxy.pred_cu.intra.mode = 1; - uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL); get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, 
width, height, costs); mode_checked[0] = true; mode_checked[1] = true; @@ -1096,7 +1096,7 @@ static uint8_t search_intra_rough( for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { search_proxy.pred_cu.intra.mode = mode + i*offset; - uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL); } } @@ -1168,7 +1168,7 @@ static uint8_t search_intra_rough( for (int block = 0; block < PARALLEL_BLKS; ++block) { search_proxy.pred_cu.intra.mode = modes_to_check[block + i]; - uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL); } @@ -1259,7 +1259,7 @@ static void get_rough_cost_for_2n_modes( double bits[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { - uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); + uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); } get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); @@ -1505,8 +1505,7 @@ int8_t uvg_search_intra_chroma_rdo( COLOR_U, u_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_intra_predict( state, &refs[COLOR_V - 1], @@ -1515,8 +1514,7 @@ int8_t uvg_search_intra_chroma_rdo( COLOR_V, v_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_generate_residual( &lcu->ref.u[offset], u_pred, From 91591c7e7cef8dce81e0ee78a794941ce269a333 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 23 Feb 2023 08:48:08 +0200 Subject: [PATCH 197/254] [dual-tree] Remove the limitation of not allowing 2 height 
chroma blocks in dual tree --- src/cu.c | 22 ---------------------- src/cu.h | 1 - src/filter.c | 6 +++--- src/intra.c | 32 +++++++------------------------- src/search.c | 10 ++-------- 5 files changed, 12 insertions(+), 59 deletions(-) diff --git a/src/cu.c b/src/cu.c index 56408b33..1159bc5e 100644 --- a/src/cu.c +++ b/src/cu.c @@ -502,25 +502,3 @@ int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* cons } return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); } - -int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height, const lcu_t* const lcu, bool left) -{ - if (left && x == 0 || !left && y == 0) return 0; - const int local_x = x % LCU_WIDTH_C; - const int local_y = y % LCU_WIDTH_C; - if (left && local_x == 0) return (LCU_WIDTH_C - local_y) / 4; - if (!left && local_y == 0) return width / 2; - - int amount = 0; - if(left) { - while (local_y + amount < LCU_WIDTH_C && LCU_GET_CU_AT_PX(lcu, local_x - TR_MIN_WIDTH, local_y + amount)->type != CU_NOTSET) { - amount += TR_MIN_WIDTH; - } - return MAX(amount / TR_MIN_WIDTH, height / TR_MIN_WIDTH); - } - while (local_x + amount < LCU_WIDTH_C && LCU_GET_CU_AT_PX(lcu, local_x + amount, local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { - amount += TR_MIN_WIDTH; - } - return MAX(amount / TR_MIN_WIDTH, width / TR_MIN_WIDTH); - -} \ No newline at end of file diff --git a/src/cu.h b/src/cu.h index 87265ee9..46c1c4e2 100644 --- a/src/cu.h +++ b/src/cu.h @@ -385,7 +385,6 @@ typedef struct { void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src); int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); -int uvg_count_chroma_tree_available_edge_cus(int x, int y, int width, int height, const lcu_t* const lcu, bool left); /** * \brief Return pointer to the top right reference CU. 
diff --git a/src/filter.c b/src/filter.c index 1fff4b55..a55dc619 100644 --- a/src/filter.c +++ b/src/filter.c @@ -1083,12 +1083,12 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, int32_t y_coord = y << 1; cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; if (dir == EDGE_VER) { - y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + y_coord = (y + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord - 1, y_coord); cu_q = uvg_cu_array_at(cua, x_coord , y_coord); } else { - x_coord = (x + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + x_coord = (x + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1); cu_q = uvg_cu_array_at(cua, x_coord, y_coord ); } @@ -1116,7 +1116,7 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, const bool large_boundary = (max_filter_length_P >= 3 && max_filter_length_Q >= 3); - const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % (LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) == 0); + const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % LCU_WIDTH == 0); uint8_t c_strength[2] = { 0, 0 }; diff --git a/src/intra.c b/src/intra.c index cad654df..314f44ed 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1153,14 +1153,8 @@ void uvg_intra_build_reference_any( } } else { - if (!is_dual_tree) { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); - px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; - } - else { - const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, true); - px_available_left = num_cus * 4; - } + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? 
num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions @@ -1282,14 +1276,8 @@ void uvg_intra_build_reference_any( } } else { - if (!is_dual_tree) { - const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); - px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; - } - else { - const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, false); - px_available_top = num_cus * 4; - } + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = !is_chroma ? num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions @@ -1475,8 +1463,8 @@ void uvg_intra_build_reference_inner( const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; } else { - const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, true); - px_available_left = num_cus * 4; + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? num_cus * 4 : num_cus * 2; } } @@ -1538,14 +1526,8 @@ void uvg_intra_build_reference_inner( } } else { - if (!is_dual_tree) { const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); - px_available_top = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; - } - else { - const int num_cus = uvg_count_chroma_tree_available_edge_cus(cu_loc->x >> 1, cu_loc->y >> 1, width, height, lcu, false); - px_available_top = num_cus * 4; - } + px_available_top = !is_chroma ? 
num_cus * 4 : num_cus * 2; } // Limit the number of available pixels based on block size and dimensions diff --git a/src/search.c b/src/search.c index 952fa1b8..36b317ac 100644 --- a/src/search.c +++ b/src/search.c @@ -1274,10 +1274,7 @@ static double search_cu( pu_depth_intra.min = ctrl->cfg.pu_depth_intra.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.min[gop_layer] : ctrl->cfg.pu_depth_intra.min[0]; pu_depth_intra.max = ctrl->cfg.pu_depth_intra.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.max[gop_layer] : ctrl->cfg.pu_depth_intra.max[0]; } - if(tree_type == UVG_CHROMA_T) { - pu_depth_intra.max = CLIP(1, 3, pu_depth_intra.max); - pu_depth_intra.min = CLIP(1, 3, pu_depth_intra.min); - } + pu_depth_inter.min = ctrl->cfg.pu_depth_inter.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.min[gop_layer] : ctrl->cfg.pu_depth_inter.min[0]; pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; @@ -1734,10 +1731,7 @@ static double search_cu( memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); // Recursively split all the way to max search depth. 
for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { - if (!can_split[split_type] - || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) - || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4) - ) + if (!can_split[split_type]) continue; if (completely_inside && check_for_early_termission( From 707e11dbcf562927272381d2527e4fb36d89ddd7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 23 Feb 2023 12:50:30 +0200 Subject: [PATCH 198/254] [dual-tree] Small fixes --- src/encode_coding_tree.c | 2 +- src/search.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 1d121d18..6cf99a61 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1668,7 +1668,7 @@ void uvg_encode_coding_tree( exit(1); } if (state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %9d %d", x << (tree_type == UVG_CHROMA_T), y << (tree_type == UVG_CHROMA_T), split_tree.split_tree, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&cabac->ctx, 1, sizeof(cabac->ctx), state->encoder_control->cabac_debug_file); } diff --git a/src/search.c b/src/search.c index 36b317ac..17aaaf8f 100644 --- a/src/search.c +++ b/src/search.c @@ -2179,7 +2179,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con &work_tree, tree_type, split_tree, - true); + tree_type == UVG_BOTH_T); // Save squared cost for rate control. 
if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { From 7a5245c5a4556de1ae408c3ac148874ac35c249a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 23 Feb 2023 13:41:59 +0200 Subject: [PATCH 199/254] [dual-tree] Fix chroma tree split model context derivation during search --- src/search.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 17aaaf8f..f7cc5bc6 100644 --- a/src/search.c +++ b/src/search.c @@ -1765,7 +1765,7 @@ static double search_cu( left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); } else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); } } if (y) { @@ -1773,7 +1773,7 @@ static double search_cu( above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); } else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); } } split_tree_t count_tree = split_tree; From 8c14fa94ba725d8be19e6201a23f7a4fa8c52914 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 2 Mar 2023 15:31:34 +0200 Subject: [PATCH 200/254] [mtt] Fix small issues with luma and chroma searches --- src/cu.c | 6 +- src/cu.h | 2 +- src/dep_quant.c | 4 +- src/encode_coding_tree.c | 6 +- src/encoderstate.h | 1 + src/intra.c | 18 +-- src/intra.h | 8 +- src/rate_control.c | 30 +++++ src/rate_control.h | 2 + src/search.c | 100 ++++++++-------- src/search_intra.c | 53 +++++---- src/strategies/generic/dct-generic.c | 6 +- src/transform.c | 163 ++++++++++++++++++++------- src/transform.h | 4 + 14 files changed, 273 insertions(+), 130 deletions(-) diff --git a/src/cu.c b/src/cu.c index 1159bc5e..d7c37108 100644 --- a/src/cu.c +++ b/src/cu.c @@ -100,7 +100,7 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px) } -void uvg_get_isp_cu_arr_coords(int *x, int *y) 
+void uvg_get_isp_cu_arr_coords(int *x, int *y, int dim) { // Do nothing if dimensions are divisible by 4 if (*y % 4 == 0 && *x % 4 == 0) return; @@ -109,7 +109,7 @@ void uvg_get_isp_cu_arr_coords(int *x, int *y) if (remainder_y != 0) { // Horizontal ISP split - if (remainder_y % 2 == 0) { + if (remainder_y % 2 == 0 && dim == 8) { // 8x2 block *y -= 2; *x += 4; @@ -122,7 +122,7 @@ void uvg_get_isp_cu_arr_coords(int *x, int *y) } else { // Vertical ISP split - if (*x % 2 == 0) { + if (*x % 2 == 0 && dim == 8) { // 2x8 block *y += 4; *x -= 2; diff --git a/src/cu.h b/src/cu.h index 46c1c4e2..8f3ec8bf 100644 --- a/src/cu.h +++ b/src/cu.h @@ -245,7 +245,7 @@ typedef struct cu_array_t { } cu_array_t; cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px); -void uvg_get_isp_cu_arr_coords(int* x, int* y); +void uvg_get_isp_cu_arr_coords(int* x, int* y, int dim); const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px); cu_array_t * uvg_cu_array_alloc(const int width, const int height); diff --git a/src/dep_quant.c b/src/dep_quant.c index a41bf6c5..8cb01860 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -343,7 +343,7 @@ static void init_quant_block( const bool needsSqrt2ScaleAdjustment, const int gValue) { - double lambda = state->lambda; + double lambda = color == COLOR_Y ? state->lambda : state->c_lambda; const int qpDQ = state->qp + 1; const int qpPer = qpDQ / 6; @@ -475,7 +475,7 @@ static void xSetLastCoeffOffset( cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cbf_is_set(cur_tu->cbf, COLOR_U)]; break; } - cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + cbfDeltaBits = compID != COLOR_Y && cur_tu->joint_cb_cr ? 
0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); } } diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 6cf99a61..858d89f4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -143,7 +143,7 @@ bool uvg_is_lfnst_allowed( uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_width, cu_height, i, isp_mode, false); int local_split_x = lcu ? split_loc.local_x : split_loc.x; int local_split_y = lcu ? split_loc.local_y : split_loc.y; - uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y); + uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y, MAX(cu_width, cu_height)); const cu_info_t* split_cu = lcu ? LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); @@ -550,7 +550,7 @@ static void encode_transform_unit( cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; int isp_x = x; int isp_y = y; - uvg_get_isp_cu_arr_coords(&isp_x, &isp_y); + uvg_get_isp_cu_arr_coords(&isp_x, &isp_y, MAX(width, height)); if(cur_pu == NULL) { cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); } @@ -645,7 +645,7 @@ static void encode_transform_coeff( int x = cu_loc->x; int y = cu_loc->y; if (isp_split) { - uvg_get_isp_cu_arr_coords(&x, &y); + uvg_get_isp_cu_arr_coords(&x, &y, MAX(cu_loc->width, cu_loc->height)); } //const encoder_control_t *const ctrl = state->encoder_control; diff --git a/src/encoderstate.h b/src/encoderstate.h index 6df843d7..7afa78ab 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -332,6 +332,7 @@ typedef struct encoder_state_t { int8_t qp; double c_lambda; + double chroma_weights[4]; /** * \brief Whether a QP delta value must be coded for the current LCU. 
diff --git a/src/intra.c b/src/intra.c index 314f44ed..1b7026e5 100644 --- a/src/intra.c +++ b/src/intra.c @@ -2001,10 +2001,10 @@ bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, - const cu_loc_t* const cu_loc, - double cost_treshold, - intra_search_data_t* const search_data, - lcu_t* const lcu) { + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst) { assert(state->search_cabac.update && "ISP reconstruction must be done with CABAC update"); double cost = 0; @@ -2012,6 +2012,7 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, const int height = cu_loc->height; search_data->best_isp_cbfs = 0; + search_data->pred_cu.intra.isp_cbfs = 0; // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. // Small blocks are split only twice. 
int split_type = search_data->pred_cu.intra.isp_mode; @@ -2020,11 +2021,11 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, int cbf_context = 2; for (int i = 0; i < split_limit; ++i) { + search_data->pred_cu.intra.isp_index = i; cu_loc_t tu_loc; uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); cu_loc_t pu_loc; uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); - search_data->pred_cu.intra.isp_index = 0; if (tu_loc.x % 4 == 0) { intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); } @@ -2036,20 +2037,23 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, int ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, tu_loc.width, tu_loc.height); - double coeff_bits = uvg_get_coeff_cost(state, lcu->coeff.y, NULL, &tu_loc, 0, SCAN_DIAG, false, COEFF_ORDER_CU); + double coeff_bits = uvg_get_coeff_cost(state, lcu->coeff.y, &search_data->pred_cu, &tu_loc, 0, SCAN_DIAG, false, COEFF_ORDER_CU); int cbf = cbf_is_set(search_data->pred_cu.cbf, COLOR_Y); - if (i + 1 != split_limit || search_data->best_isp_cbfs != 1 << (split_limit - 1)) { + if (i + 1 != split_limit || search_data->best_isp_cbfs != 0) { CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_luma[cbf_context], cbf, coeff_bits, "cbf_luma_isp_recon"); } cost += ssd + coeff_bits * state->lambda; cbf_context = 2 + cbf; + if(violates_lfnst) *violates_lfnst |= search_data->pred_cu.violates_lfnst_constrained_luma; + search_data->pred_cu.violates_lfnst_constrained_luma = false; search_data->best_isp_cbfs |= cbf << i; search_data->pred_cu.intra.isp_cbfs = search_data->best_isp_cbfs; } + search_data->pred_cu.intra.isp_index = 0; return cost; } \ No newline at end of file diff --git a/src/intra.h b/src/intra.h index 676588ec..c15b182a 100644 --- a/src/intra.h +++ b/src/intra.h @@ -152,10 +152,10 @@ void uvg_intra_recon_cu( bool 
recon_chroma); double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, - const cu_loc_t* const cu_loc, - double cost_treshold, - intra_search_data_t* const search_data, - lcu_t* const lcu); + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst); int8_t uvg_get_co_located_luma_mode( const cu_loc_t* const chroma_loc, diff --git a/src/rate_control.c b/src/rate_control.c index 0660f0ac..3dfa35fe 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -804,6 +804,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) state->qp = est_qp; int8_t chroma_qp = encoder->qp_map[0][est_qp]; double tmpWeight = pow(2.0, (est_qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = est_lambda / tmpWeight; ctu->qp = est_qp; ctu->lambda = est_lambda; @@ -1174,6 +1179,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, int8_t chroma_qp = ctrl->qp_map[0][state->qp]; double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? 
pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = state->lambda / tmpWeight; // Apply variance adaptive quantization @@ -1201,3 +1211,23 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, lcu->adjust_qp = state->qp; } } + + +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode) +{ + const encoder_control_t * const ctrl = state->encoder_control; + double lambda = state->lambda; + int8_t chroma_qp = ctrl->qp_map[0][state->qp]; + double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + lambda /= tmpWeight; + lambda *= use_jccr && state->qp > 18 ? 
1.3 : 1.0; + if (jccr_mode == 1 || jccr_mode == 2) { + lambda *= 0.8; + } else if (jccr_mode == 3) { + lambda *= 0.5; + } + return lambda; +} \ No newline at end of file diff --git a/src/rate_control.h b/src/rate_control.h index f397e2a2..644d7fc4 100644 --- a/src/rate_control.h +++ b/src/rate_control.h @@ -76,4 +76,6 @@ void uvg_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos); void uvg_update_after_picture(encoder_state_t * const state); void uvg_estimate_pic_lambda(encoder_state_t * const state); +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode); + #endif // RATE_CONTROL_H_ diff --git a/src/search.c b/src/search.c index f7cc5bc6..b1ac6944 100644 --- a/src/search.c +++ b/src/search.c @@ -43,6 +43,7 @@ #include "imagelist.h" #include "inter.h" #include "intra.h" +#include "rate_control.h" #include "uvg266.h" #include "rdo.h" #include "search_inter.h" @@ -731,7 +732,8 @@ static double cu_rd_cost_tr_split_accurate( cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; - double tr_tree_bits = 0; + double luma_bits = 0; + double chroma_bits = 0; const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, COLOR_U); const int cb_flag_v = tr_cu->joint_cb_cr ? 
tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, COLOR_V); @@ -743,7 +745,7 @@ static double cu_rd_cost_tr_split_accurate( // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, luma_bits, "rqt_root_cbf"); } } @@ -768,13 +770,13 @@ static double cu_rd_cost_tr_split_accurate( for (int i = 0; i < split_count; ++i) { sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], chroma_loc ? &split_chroma_cu_loc[i] : NULL, has_chroma); } - return sum + tr_tree_bits * state->lambda; + return sum + luma_bits * state->lambda; } has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && has_chroma && tree_type != UVG_LUMA_T; if (!skip_residual_coding && has_chroma) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, chroma_bits, "cbf_cb"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, chroma_bits, "cbf_cr"); } const int cb_flag_y = cbf_is_set(tr_cu->cbf, COLOR_Y) && tree_type != UVG_CHROMA_T; @@ -791,7 +793,7 @@ static double cu_rd_cost_tr_split_accurate( { cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, luma_bits, "cbf_y_search"); } } else { @@ -802,7 +804,7 @@ static double cu_rd_cost_tr_split_accurate( for (int i = 0; i < split_limit; i++) { if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { const int flag = (isp_cbf >> i) & 1; - 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, luma_bits, "cbf_y_search"); luma_ctx = 2 + flag; } } @@ -812,7 +814,7 @@ static double cu_rd_cost_tr_split_accurate( // TODO qp_delta_sign_flag if ((cb_flag_u || cb_flag_v) && has_chroma && state->encoder_control->cfg.jccr) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, chroma_bits, "tu_joint_cbcr_residual_flag"); } } @@ -834,7 +836,7 @@ static double cu_rd_cost_tr_split_accurate( if(cb_flag_y || is_isp){ if (can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, luma_bits, "transform_skip_flag"); } int8_t luma_scan_mode = SCAN_DIAG; if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { @@ -872,14 +874,14 @@ static double cu_rd_cost_tr_split_accurate( cabac, &cabac->ctx.lfnst_idx_model[1], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } @@ -903,38 +905,34 @@ static double cu_rd_cost_tr_split_accurate( if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width, chroma_height); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[1]; unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, 
LCU_WIDTH_C, - chroma_width, chroma_height); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[2]; chroma_ssd = ssd_u + ssd_v; } if(chroma_can_use_tr_skip && cb_flag_u) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } if(chroma_can_use_tr_skip && cb_flag_v) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &temp_chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &temp_chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width, chroma_height); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[3]; int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width, chroma_height); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[3]; chroma_ssd = ssd_u_joint + ssd_v_joint; } if (chroma_can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, 
"transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } @@ -945,14 +943,14 @@ static double cu_rd_cost_tr_split_accurate( cabac, &cabac->ctx.lfnst_idx_model[is_chroma_tree], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } @@ -963,20 +961,20 @@ static double cu_rd_cost_tr_split_accurate( bool symbol = tr_cu->tr_idx != 0; int ctx_idx = 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); ctx_idx++; for (int i = 0; i < 3 && symbol; i++, ctx_idx++) { symbol = tr_cu->tr_idx > i + MTS_DST7_DST7 ? 
1 : 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); } tr_cu->mts_last_scan_pos = false; tr_cu->violates_mts_coeff_constraint = false; } - double bits = tr_tree_bits + coeff_bits; - return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda; + double bits = luma_bits + coeff_bits; + return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + (bits + chroma_bits) * state->lambda; } @@ -1378,7 +1376,8 @@ static double search_cu( cu_loc, 0, &intra_search, - lcu + lcu, + NULL ); memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); } @@ -1478,20 +1477,23 @@ static double search_cu( recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); - uvg_intra_recon_cu(state, - &intra_search, cu_loc, - NULL, lcu, - tree_type, - recon_luma, recon_chroma); if (!state->encoder_control->cfg.cclm && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP) { uvg_recon_and_estimate_cost_isp( state, cu_loc, 0, &intra_search, - lcu + lcu, + NULL ); } + else { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + NULL, lcu, + tree_type, + recon_luma, recon_chroma); + } if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) @@ -1518,7 +1520,7 @@ static double search_cu( // Set isp split cbfs here const int split_type = intra_search.pred_cu.intra.isp_mode; - const int split_num = split_type == ISP_MODE_NO_ISP ? 0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); + const int split_num = split_type == ISP_MODE_NO_ISP || tree_type == UVG_CHROMA_T ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); const int cbf_cb = cbf_is_set(cur_cu->cbf, COLOR_U); const int cbf_cr = cbf_is_set(cur_cu->cbf, COLOR_V); @@ -1530,7 +1532,7 @@ static double search_cu( // Fetch proper x, y coords for isp blocks int tmp_x = isp_loc.x; int tmp_y = isp_loc.y; - uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); + uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y, MAX(cu_width, cu_height)); cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; cbf_clear(&split_cu->cbf, COLOR_Y); @@ -1733,6 +1735,13 @@ static double search_cu( for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { if (!can_split[split_type]) continue; + split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_type != QT_SPLIT && is_implicit), + 0 + }; if (completely_inside && check_for_early_termission( cu_width, @@ -1798,13 +1807,6 @@ static double search_cu( continue; } - split_tree_t new_split = { - split_tree.split_tree | split_type << (split_tree.current_depth * 3), - split_tree.current_depth + 1, - split_tree.mtt_depth + (split_type != QT_SPLIT), - split_tree.implicit_mtt_depth + (split_type != QT_SPLIT && is_implicit), - 0 - }; state->search_cabac.update = 0; split_cost += split_bits * state->lambda; @@ -2166,7 +2168,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con } int tree_type = state->frame->slicetype == UVG_SLICE_I - && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T; + && state->encoder_control->cfg.dual_tree + ? 
UVG_LUMA_T + : UVG_BOTH_T; cu_loc_t start; uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); diff --git a/src/search_intra.c b/src/search_intra.c index 9416f122..2e507f95 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -365,6 +365,7 @@ static double search_intra_trdepth( for (trafo = mts_start; trafo < num_transforms; trafo++) { for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { // Initialize lfnst variables + search_data->best_isp_cbfs = 0; pred_cu->tr_idx = trafo; pred_cu->tr_skip = trafo == MTS_SKIP; pred_cu->lfnst_idx = lfnst_idx; @@ -400,8 +401,10 @@ static double search_intra_trdepth( cu_loc, cost_treshold, search_data, - lcu + lcu, + &constraints[0] ); + constraints[1] = search_data->best_isp_cbfs != 0; } else { uvg_intra_recon_cu( @@ -427,7 +430,7 @@ static double search_intra_trdepth( } } - if (trafo != MTS_SKIP && end_lfnst_idx != 0) { + if (trafo != MTS_SKIP && end_lfnst_idx != 0 && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { uvg_derive_lfnst_constraints( pred_cu, constraints, @@ -438,7 +441,7 @@ static double search_intra_trdepth( COLOR_Y); } - if (!constraints[1] && (cbf_is_set(pred_cu->cbf, COLOR_Y) || pred_cu->intra.isp_mode != ISP_MODE_NO_ISP)) { + if (!constraints[1] && cbf_is_set(pred_cu->cbf, COLOR_Y)) { //end_idx = 0; if (pred_cu->lfnst_idx > 0) { continue; @@ -456,8 +459,8 @@ static double search_intra_trdepth( } double transform_bits = 0; if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && - trafo != MTS_SKIP && end_lfnst_idx != 0) { - if ((!constraints[0] && constraints[1]) || lfnst_idx != 0) { + trafo != MTS_SKIP && end_lfnst_idx != 0 && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0)) { + if ((!constraints[0] && (constraints[1] || pred_cu->intra.isp_mode != ISP_MODE_NO_ISP))) { transform_bits += CTX_ENTROPY_FBITS( &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], lfnst_idx != 0); @@ -469,6 +472,7 @@ static double search_intra_trdepth( } } if 
(num_transforms > 2 && trafo != MTS_SKIP + && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0) && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP && lfnst_idx == 0 && width <= 32 @@ -952,8 +956,9 @@ static INLINE double count_bits( const double not_mpm_mode_bit, const double planar_mode_flag, const double not_planar_mode_flag, + const double not_isp_flag, int8_t mode - ) +) { int i = 0; int smaller_than_pred = 0; @@ -975,7 +980,7 @@ static INLINE double count_bits( else { bits = not_mpm_mode_bit + 5 + (mode - smaller_than_pred > 2); } - bits += not_mrl + not_mip; + bits += not_mrl + not_mip + not_isp_flag; return bits; } @@ -1023,13 +1028,14 @@ static uint8_t search_intra_rough( int8_t mode; double cost; }; - + const double not_mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0; const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0); const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); + const double not_isp_flag = state->encoder_control->cfg.isp && uvg_can_use_isp(width, height) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_subpart_model[0]), 0) : 0; const uint8_t mode_list_size = state->encoder_control->cfg.mip ? 
6 : 3; struct mode_cost best_six_modes[6]; @@ -1059,7 +1065,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 0) * state->lambda_sqrt; + not_isp_flag, 0) * state->lambda_sqrt; costs[1] += count_bits( state, intra_preds, @@ -1069,7 +1075,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 1) * state->lambda_sqrt; + not_isp_flag, 1) * state->lambda_sqrt; if(costs[0] < costs[1]) { min_cost = costs[0]; max_cost = costs[1]; @@ -1113,7 +1119,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - mode + i * offset) * state->lambda_sqrt; + not_isp_flag, mode + i * offset) * state->lambda_sqrt; } } @@ -1184,7 +1190,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - modes_to_check[block + i]) * state->lambda_sqrt; + not_isp_flag, modes_to_check[block + i]) * state->lambda_sqrt; } @@ -1327,7 +1333,8 @@ static int8_t search_intra_rdo( for (int mode = 0; mode < modes_to_check; mode++) { bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false : true; // Cannot use ISP with MIP - can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL + // can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL + const uint8_t mrl_idx = search_data[mode].pred_cu.intra.multi_ref_idx; double best_isp_cost = MAX_DOUBLE; double best_bits = MAX_DOUBLE; int8_t best_isp_mode = 0; @@ -1340,6 +1347,7 @@ static int8_t search_intra_rdo( search_data[mode].pred_cu.intra.isp_mode = isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = isp_mode == ISP_MODE_NO_ISP ? 
mrl_idx : 0; double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, cu_loc, lcu); search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; search_data[mode].bits = rdo_bitcost; @@ -1362,6 +1370,7 @@ static int8_t search_intra_rdo( search_data[mode].cost = best_isp_cost; search_data[mode].bits = best_bits; search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = best_isp_mode == ISP_MODE_NO_ISP ? mrl_idx : 0; search_data[mode].pred_cu.tr_idx = best_mts_mode_for_isp[best_isp_mode]; search_data[mode].pred_cu.tr_skip = best_mts_mode_for_isp[best_isp_mode] == MTS_SKIP; search_data[mode].pred_cu.lfnst_idx = best_lfnst_mode_for_isp[best_isp_mode]; @@ -1482,11 +1491,13 @@ int8_t uvg_search_intra_chroma_rdo( ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; + double original_c_lambda = state->c_lambda; for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) { const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma; double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode); - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; + chroma_data[mode_i].bits = mode_bits; cu_info_t* pred_cu = &chroma_data[mode_i].pred_cu; uint8_t best_lfnst_index = 0; for (int lfnst_i = 0; lfnst_i < 3; ++lfnst_i) { @@ -1494,9 +1505,10 @@ int8_t uvg_search_intra_chroma_rdo( if (lfnst == -1) { continue; } + state->c_lambda = original_c_lambda * (state->encoder_control->cfg.jccr && state->qp > 18 ? 
1.3 : 1.0); pred_cu->cr_lfnst_idx = lfnst; - chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->lambda; - if (PU_IS_TU(pred_cu) && (tree_type != UVG_CHROMA_T || (pred_cu->log2_width < 5 && pred_cu->log2_height < 5))) { + chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->c_lambda; + if (PU_IS_TU(pred_cu) && (tree_type != UVG_CHROMA_T || (pred_cu->log2_chroma_width < 5 && pred_cu->log2_chroma_height < 5))) { uvg_intra_predict( state, &refs[COLOR_U - 1], @@ -1552,8 +1564,9 @@ int8_t uvg_search_intra_chroma_rdo( continue; } + double actual_cost = state->lambda * (chorma_ts_out.u_bits + chorma_ts_out.v_bits + mode_bits) + (chorma_ts_out.u_distortion + chorma_ts_out.v_distortion); if(chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if( chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = 0; @@ -1565,7 +1578,7 @@ int8_t uvg_search_intra_chroma_rdo( } } else { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_combined_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if (chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = chorma_ts_out.best_combined_index; @@ -1574,10 +1587,11 @@ int8_t uvg_search_intra_chroma_rdo( chroma_data[mode_i].cost = chroma_data[mode_i].lfnst_costs[lfnst]; } } + } else { state->search_cabac.update = 1; - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; uvg_intra_recon_cu(state, &chroma_data[mode_i], cu_loc, pred_cu, lcu, @@ -1593,6 +1607,7 @@ int8_t uvg_search_intra_chroma_rdo( } sort_modes(chroma_data, num_modes); + state->c_lambda = 
original_c_lambda; return chroma_data[0].pred_cu.intra.mode_chroma; } diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 2a673d21..ccddf17a 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2608,7 +2608,7 @@ static void mts_dct_generic( if (height == 1) { dct_hor(input, output, shift_1st, height, 0, skip_width); } else if (width == 1) { - dct_ver(input, output, shift_2nd, width, 0, skip_height); + dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height); } else { dct_hor(input, tmp, shift_1st, height, 0, skip_width); dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); @@ -2666,9 +2666,9 @@ static void mts_idct_generic( const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth; if (height == 1) { - idct_hor(input, output, shift_1st, height, 0, skip_width); + idct_hor(input, output, shift_2nd + 1, height, 0, skip_width); } else if (width == 1) { - idct_ver(input, output, shift_2nd, width, 0, skip_height); + idct_ver(input, output, shift_2nd + 1, width, 0, skip_height); } else { idct_ver(input, tmp, shift_1st, width, skip_width, skip_height); idct_hor(tmp, output, shift_2nd, height, 0, skip_width); diff --git a/src/transform.c b/src/transform.c index 783d9f2b..58051a87 100644 --- a/src/transform.c +++ b/src/transform.c @@ -37,6 +37,7 @@ #include "intra.h" #include "uvg266.h" #include "lfnst_tables.h" +#include "rate_control.h" #include "rdo.h" #include "strategies/strategies-dct.h" #include "strategies/strategies-quant.h" @@ -362,7 +363,7 @@ static void generate_jccr_transforms( } } } - costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1; + costs[jccr] = jccr == 0 ? 
MIN(d1, d2) : d1; } int64_t min_dist1 = costs[0]; int64_t min_dist2 = INT64_MAX; @@ -418,8 +419,7 @@ static void generate_jccr_transforms( static void quantize_chroma( encoder_state_t* const state, cu_info_t * const cur_tu, - int8_t width, - int8_t height, + const cu_loc_t* const cu_loc, coeff_t u_coeff[5120], coeff_t v_coeff[2048], enum uvg_chroma_transforms transform, @@ -428,9 +428,13 @@ static void quantize_chroma( const coeff_scan_order_t scan_order, bool* u_has_coeffs, bool* v_has_coeffs, - uint8_t lfnst_idx, - enum uvg_tree_type tree_type) + uint8_t lfnst_idx, + enum uvg_tree_type tree_type, + double* u_coeff_cost, + double* v_coeff_cost) { + int8_t width = cu_loc->chroma_width; + int8_t height = cu_loc->chroma_height; if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { int abs_sum = 0; uvg_dep_quant( @@ -445,10 +449,23 @@ static void quantize_chroma( &abs_sum, state->encoder_control->cfg.scaling_list ); + + cbf_clear(&cur_tu->cbf, COLOR_U); if (abs_sum > 0) { *u_has_coeffs = 1; cbf_set(&cur_tu->cbf, COLOR_U); } + + *u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + cur_tu, + cu_loc, + COLOR_U, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); + if (transform == DCT7_CHROMA) { abs_sum = 0; uvg_dep_quant( @@ -463,10 +480,24 @@ static void quantize_chroma( &abs_sum, state->encoder_control->cfg.scaling_list ); + + cbf_clear(&cur_tu->cbf, COLOR_V); if (abs_sum > 0) { *v_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_V); } + + *v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + cur_tu, + cu_loc, + COLOR_V, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); cbf_clear(&cur_tu->cbf, COLOR_U); + cbf_clear(&cur_tu->cbf, COLOR_V); } return; } @@ -580,6 +611,9 @@ void uvg_chroma_transform_search( trans_offset, &num_transforms); } + + double lambda = state->c_lambda; + chorma_ts_out->best_u_cost = MAX_DOUBLE; chorma_ts_out->best_v_cost = MAX_DOUBLE; chorma_ts_out->best_combined_cost = MAX_DOUBLE; @@ -600,11 +634,27 @@ void 
uvg_chroma_transform_search( uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } } + uint8_t old_jccr = pred_cu->joint_cb_cr; + pred_cu->joint_cb_cr = 0; + if(is_jccr) { + state->c_lambda = lambda * (transforms[i] == JCCR_3 ? 0.5 : 0.8); + pred_cu->joint_cb_cr = transforms[i]; + } + else if(state->encoder_control->cfg.dep_quant) { + state->search_cabac.update = 1; + } + + double u_coeff_cost = 0; + double v_coeff_cost = 0; + unsigned ssd_u = 0; + unsigned ssd_v = 0; + double u_bits = 0; + double v_bits = 0; + quantize_chroma( state, pred_cu, - width, - height, + cu_loc, &u_coeff[i * trans_offset], &v_coeff[i * trans_offset], transforms[i], @@ -612,8 +662,12 @@ void uvg_chroma_transform_search( v_quant_coeff, SCAN_DIAG, &u_has_coeffs, - &v_has_coeffs, tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx, tree_type); - if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue; + &v_has_coeffs, tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx, + tree_type, + &u_coeff_cost, + &v_coeff_cost); + pred_cu->joint_cb_cr = old_jccr; + if (pred_cu->cr_lfnst_idx != 0 && !u_has_coeffs && !v_has_coeffs) goto reset_cabac; if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { bool constraints[2] = { false, false }; @@ -621,10 +675,10 @@ void uvg_chroma_transform_search( if(!is_jccr) { uvg_derive_lfnst_constraints(pred_cu, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } - if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; + if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) goto reset_cabac; } - if (is_jccr && !u_has_coeffs) continue; + if (is_jccr && !u_has_coeffs) goto reset_cabac; if (u_has_coeffs) { uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, @@ -697,8 +751,6 @@ void uvg_chroma_transform_search( uvg_pixels_blit(v_pred, &v_recon[trans_offset * i], width, height, width, width); } - unsigned ssd_u = 0; - unsigned ssd_v = 0; if (!state->encoder_control->cfg.lossless) { ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i], LCU_WIDTH_C, width, @@ -706,10 +758,10 @@ void uvg_chroma_transform_search( ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i], LCU_WIDTH_C, width, width, height); + ssd_u = (double)ssd_u * state->chroma_weights[1]; + ssd_v = (double)ssd_v * state->chroma_weights[2]; } - double u_bits = 0; - double v_bits = 0; state->search_cabac.update = 1; int cbf_u = transforms[i] & 2 || (u_has_coeffs && !(transforms[i] & 1)); @@ -733,16 +785,17 @@ void uvg_chroma_transform_search( transforms[i] == CHROMA_TS, u_bits, "tr_skip_u" ); } - double coeff_cost = uvg_get_coeff_cost( - state, - u_quant_coeff, - pred_cu, - cu_loc, - COLOR_U, - SCAN_DIAG, - transforms[i] == CHROMA_TS, - COEFF_ORDER_LINEAR); - u_bits += coeff_cost; + if(u_coeff_cost == 0) { + u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + pred_cu, + cu_loc, + COLOR_U, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } if (cbf_v && !is_jccr) { if (can_use_tr_skip) { @@ -750,16 +803,20 @@ void uvg_chroma_transform_search( transforms[i] == CHROMA_TS, v_bits, "tr_skip_v" ); } - v_bits += uvg_get_coeff_cost( - state, - v_quant_coeff, - pred_cu, - cu_loc, - COLOR_V, - SCAN_DIAG, - transforms[i] == CHROMA_TS, - COEFF_ORDER_LINEAR); + if (v_coeff_cost == 0) { + v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + pred_cu, + cu_loc, + COLOR_V, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } + u_bits += u_coeff_cost; + v_bits += v_coeff_cost; if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc, lcu)) { const int 
lfnst_idx = pred_cu->cr_lfnst_idx; @@ -781,25 +838,35 @@ void uvg_chroma_transform_search( pred_cu->lfnst_last_scan_pos = false; pred_cu->violates_lfnst_constrained_chroma = false; } + if (!is_jccr) { - double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->c_lambda; - double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->c_lambda; + double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->lambda; + double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->lambda; if (u_cost < chorma_ts_out->best_u_cost) { chorma_ts_out->best_u_cost = u_cost; chorma_ts_out->best_u_index = u_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; } if (v_cost < chorma_ts_out->best_v_cost) { chorma_ts_out->best_v_cost = v_cost; chorma_ts_out->best_v_index = v_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } else { - double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->c_lambda; - if (cost < chorma_ts_out->best_combined_cost) { + double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->lambda; + if (cost < chorma_ts_out->best_combined_cost && cost < chorma_ts_out->best_u_cost + chorma_ts_out->best_v_cost) { chorma_ts_out->best_combined_cost = cost; chorma_ts_out->best_combined_index = transforms[i]; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } +reset_cabac: memcpy(&state->search_cabac, temp_cabac, sizeof(cabac_data_t)); } } @@ -1493,9 +1560,24 @@ void uvg_quantize_lcu_residual( if (luma) { quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } + double c_lambda = state->c_lambda; + state->c_lambda = uvg_calculate_chroma_lambda(state, state->encoder_control->cfg.jccr, cur_pu->joint_cb_cr); if (chroma) { - quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, 
tree_type); - quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + if(state->encoder_control->cfg.dep_quant) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + cu_loc_t temp_chroma_loc; + uvg_cu_loc_ctor(&temp_chroma_loc, (cu_loc->x >> 1) % LCU_WIDTH_C, (cu_loc->y >> 1) % LCU_WIDTH_C, cu_loc->width, cu_loc->height); + uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &temp_chroma_loc, COLOR_U, 0, (cur_pu->tr_skip & 2) >> 1, COEFF_ORDER_CU); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } + else { + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + } } if (jccr && PU_IS_TU(cur_pu)) { quantize_tr_residual(state, COLOR_UV, &loc, cur_pu, lcu, early_skip, tree_type); @@ -1503,5 +1585,6 @@ void uvg_quantize_lcu_residual( if(chroma && jccr && PU_IS_TU(cur_pu)) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); } + state->c_lambda = c_lambda; } } diff --git a/src/transform.h b/src/transform.h index d2b95ca8..be485f46 100644 --- a/src/transform.h +++ b/src/transform.h @@ -88,6 +88,10 @@ typedef struct { int best_u_index; int best_v_index; int best_combined_index; + uint64_t u_distortion; + uint64_t v_distortion; + double u_bits; + double v_bits; } uvg_chorma_ts_out_t; void uvg_quantize_lcu_residual( From 805afb1331245f8792ee9c6a10d766d4f4a643e0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 5 Apr 2023 09:33:00 +0300 Subject: [PATCH 201/254] [fix] Minor fixes --- src/encoderstate.c | 2 +- src/strategies/generic/intra-generic.c | 34 ++++++++++++++------------ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/encoderstate.c 
b/src/encoderstate.c index e8a43548..78c9c9f2 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -1304,7 +1304,7 @@ static void encoder_state_encode(encoder_state_t * const main_state) { sub_state->tile->frame->width_in_lcu * LCU_WIDTH, sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); - if(main_state->encoder_control->cfg.dual_tree){ + if(main_state->encoder_control->cfg.dual_tree && main_state->frame->is_irap){ sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray( main_state->tile->frame->chroma_cu_array, offset_x, diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index e00ac48a..398388fc 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -190,11 +190,28 @@ static void uvg_angular_pred_generic( if (!vertical_mode) { SWAP(width, height, int) } if (sample_disp != 0) { + bool use_cubic = true; // Default to cubic filter + static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; + int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); + if (dist_from_vert_or_hor > filter_threshold) { + if ((abs(sample_disp) & 0x1F) != 0) + { + use_cubic = false; + } + } + // Cubic must be used if ref line != 0 or if isp mode is != 0 + if (multi_ref_index || isp) { + use_cubic = true; + } // The mode is not horizontal or vertical, we have to do interpolation. for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { + int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); + const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; + int16_t const* const f = use_cubic ? 
cubic_filter[delta_fract] : filter_coeff; if ((abs(sample_disp) & 0x1F) != 0) { @@ -202,22 +219,7 @@ static void uvg_angular_pred_generic( if (channel_type == 0) { int32_t ref_main_index = delta_int; uvg_pixel p[4]; - bool use_cubic = true; // Default to cubic filter - static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; - int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); - if (dist_from_vert_or_hor > filter_threshold) { - if ((abs(sample_disp) & 0x1F) != 0) - { - use_cubic = false; - } - } - // Cubic must be used if ref line != 0 or if isp mode is != 0 - if (multi_ref_index || isp) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; - int16_t const * const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; + // Do 4-tap intra interpolation filtering for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { p[0] = ref_main[ref_main_index]; From 312ac6731c7912b333b3fb768b948e18bd17e470 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Tue, 15 Aug 2023 13:24:22 +0300 Subject: [PATCH 202/254] [ibc] dual-tree rebase fixes --- src/search.c | 5 ++--- src/search_ibc.c | 25 ++++++++----------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/search.c b/src/search.c index b1ac6944..45c8459f 100644 --- a/src/search.c +++ b/src/search.c @@ -1591,8 +1591,7 @@ static double search_cu( lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu, UVG_BOTH_T); } } - - if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { + // The cabac functions assume chroma locations whereas the search uses luma locations // for the chroma tree, therefore we need to shift the chroma coordinates here for // passing to the bit cost calculating functions. 
@@ -1602,7 +1601,7 @@ static double search_cu( separate_tree_chroma_loc.width >>= 1; separate_tree_chroma_loc.height >>= 1; - if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; diff --git a/src/search_ibc.c b/src/search_ibc.c index b7067c8c..d916d986 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -922,19 +922,18 @@ static void search_pu_ibc( cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cu_loc, cur_pu, lcu, true, UVG_BOTH_T); + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -1112,8 +1111,6 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.num_merge_cand = uvg_inter_get_merge_cand( state, cu_loc, - merge_a1, - merge_b1, info.merge_cand, lcu); @@ -1128,11 +1125,6 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, static int evaluations = 0; static int hits = 0; - - UVG_CLOCK_T hashmap_start_temp; - UVG_CLOCK_T 
hashmap_end_temp; - - UVG_CLOCK_T hashmap_start_real_time; UVG_CLOCK_T hashmap_end_real_time; UVG_GET_TIME(&hashmap_start_real_time); @@ -1194,7 +1186,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, if (full_block) { double cost = ibc_cost, bits = ibc_bitcost; vector2d_t mv = { best_mv_x, best_mv_y}; - cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits); + cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, 0, &bits); //double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; //cost += bool better_mv = cost < ibc_cost; @@ -1221,7 +1213,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, //if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64) //fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found); - if (!found_block) return; + if (!found_block) return 0; *inter_cost = ibc_cost; *inter_bitcost = ibc_bitcost; @@ -1250,7 +1242,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->skipped = merged; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); ibc_cost += ibc_flag * state->lambda; ibc_bitcost += ibc_flag; @@ -1267,7 +1259,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } - + return 1; } @@ -1313,7 +1305,6 @@ void uvg_search_cu_ibc(encoder_state_t * const state, search_pu_ibc(state, cu_loc, - SIZE_2Nx2N, 0, amvp, &merge, &info); From 02395727966253b1dcd42f70c2f402cac79ee123 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 23 Aug 2023 15:21:45 +0300 Subject: [PATCH 203/254] [ibc] Fix some instances where 
CU_INTER was checked instead of !CU_INTRA --- src/search.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/search.c b/src/search.c index 45c8459f..755062ab 100644 --- a/src/search.c +++ b/src/search.c @@ -781,7 +781,7 @@ static double cu_rd_cost_tr_split_accurate( const int cb_flag_y = cbf_is_set(tr_cu->cbf, COLOR_Y) && tree_type != UVG_CHROMA_T; - const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); + const bool is_isp = !(pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. if (!is_isp) { const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; @@ -839,7 +839,7 @@ static double cu_rd_cost_tr_split_accurate( CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, luma_bits, "transform_skip_flag"); } int8_t luma_scan_mode = SCAN_DIAG; - if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + if (pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; const coeff_t* coeffs = lcu->coeff.y; @@ -1290,7 +1290,7 @@ static double search_cu( cur_cu->log2_chroma_width = uvg_g_convert_to_log2[chroma_loc->chroma_width]; } - intra_search_data_t intra_search; + intra_search_data_t intra_search = {0}; const bool completely_inside = x + luma_width <= frame_width && y + luma_height <= frame_height; // If the CU is completely inside the frame at this depth, search for From 26ef1dda09ea196774cbaa27029e6e93dd0d3305 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 30 Aug 2023 15:06:08 +0300 Subject: [PATCH 204/254] [ibc] Fix chroma SAD handling and disable chroma SAD for now --- src/search_ibc.c | 39 ++++++++++++++++++----------- src/strategies/strategies-picture.c | 1 + 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/search_ibc.c 
b/src/search_ibc.c index d916d986..2d80ec28 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -75,7 +75,8 @@ typedef struct { * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks */ - optimized_sad_func_ptr_t optimized_sad; + optimized_sad_func_ptr_t optimized_sad_y; + optimized_sad_func_ptr_t optimized_sad_uv; lcu_t *lcu; @@ -166,12 +167,15 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu } -static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_sad(ibc_search_info_t *info, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { const uint32_t x = loc->x; const uint32_t y = loc->y; - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - + lcu_t *lcu = info->lcu; + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const encoder_state_t* state = info->state; + cu_info_t cu_backup = *cur_cu; uint32_t cost = MAX_INT; @@ -195,19 +199,22 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s *cur_cu = cu_backup; - if (optimized_sad != NULL) { - cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - } + if (info->optimized_sad_y != NULL) { + cost = info->optimized_sad_y(lcu->rec.y + offset, 
&state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); } else { cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { + } + + // ToDo: Enable chroma cost calculation + /* if (state->encoder_control->chroma_format != UVG_CSP_400) { + if (info->optimized_sad_uv != NULL) { + cost += info->optimized_sad_uv(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + cost += info->optimized_sad_uv(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + } else { cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); } - } + }*/ return cost; } @@ -247,7 +254,7 @@ static bool check_mv_cost(ibc_search_info_t *info, uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height); - cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, &loc, x, y); + cost = calculate_ibc_cost_sad(info, &loc, x, y); if (cost >= *best_cost) return false; @@ -826,7 +833,8 @@ static void search_pu_ibc( info->height = height_cu; info->mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width_cu); + info->optimized_sad_y = uvg_get_optimized_sad(width_cu); + info->optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); info->lcu = lcu; // Search for merge mode candidates @@ -1104,7 +1112,8 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.height = height_cu; info.mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info.optimized_sad = uvg_get_optimized_sad(width_cu); + info.optimized_sad_y = uvg_get_optimized_sad(width_cu); + info.optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); info.lcu = lcu; // Search for merge mode candidates diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 643d2f8f..649af2d6 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -37,6 +37,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" +#include "strategies/sse42/picture-sse42.h" #include "strategyselector.h" From 73442f1bba8791a4b3651b56efae01c58fba3f86 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 5 Apr 2023 11:17:16 +0300 Subject: [PATCH 205/254] [depquant] AoS -> SoA for Decision --- src/dep_quant.c | 221 ++++++++++++++++++++++++++---------------------- 1 file changed, 119 insertions(+), 102 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 8cb01860..86968e50 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -95,9 +95,9 @@ typedef struct typedef struct { - int64_t rdCost; - coeff_t absLevel; - int prevId; + int64_t rdCost[8]; + coeff_t absLevel[8]; + int prevId[8]; } Decision; @@ -152,7 +152,7 @@ typedef struct depquant_state* m_skipStates; depquant_state m_startState; quant_block m_quant; - Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH][8]; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; } 
context_store; @@ -526,15 +526,22 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2] state->m_sbbFracBits[1] = 0; } -static INLINE void checkRdCostSkipSbbZeroOut(Decision *decision, const depquant_state * const state) +static INLINE void checkRdCostSkipSbbZeroOut(Decision * decision, const depquant_state * const state, int decision_id) { int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; - decision->rdCost = rdCost; - decision->absLevel = 0; - decision->prevId = 4 + state->m_stateId; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId; } -static void checkRdCosts(const depquant_state * const state, const enum ScanPosType spt, const PQData *pqDataA, const PQData *pqDataB, Decision *decisionA, Decision *decisionB) +static void checkRdCosts( + const depquant_state * const state, + const enum ScanPosType spt, + const PQData *pqDataA, + const PQData *pqDataB, + Decision *decisions, + int decisionA, + int decisionB) { const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar]; int64_t rdCostA = state->m_rdCost + pqDataA->deltaDist; @@ -582,7 +589,7 @@ static void checkRdCosts(const depquant_state * const state, const enum ScanPosT } else { - rdCostZ = decisionA->rdCost; + rdCostZ = decisions->rdCost[decisionA]; } } else @@ -597,38 +604,39 @@ static void checkRdCosts(const depquant_state * const state, const enum ScanPosT : (pqDataB->absLevel < RICEMAX ? 
pqDataB->absLevel : RICEMAX - 1)]; rdCostZ += goRiceTab[state->m_goRiceZero]; } - if (rdCostA < decisionA->rdCost) + if (rdCostA < decisions->rdCost[decisionA]) { - decisionA->rdCost = rdCostA; - decisionA->absLevel = pqDataA->absLevel; - decisionA->prevId = state->m_stateId; + decisions->rdCost[decisionA] = rdCostA; + decisions->absLevel[decisionA] = pqDataA->absLevel; + decisions->prevId[decisionA] = state->m_stateId; } - if (rdCostZ < decisionA->rdCost) + if (rdCostZ < decisions->rdCost[decisionA]) { - decisionA->rdCost = rdCostZ; - decisionA->absLevel = 0; - decisionA->prevId = state->m_stateId; + decisions->rdCost[decisionA] = rdCostZ; + decisions->absLevel[decisionA] = 0; + decisions->prevId[decisionA] = state->m_stateId; } - if (rdCostB < decisionB->rdCost) + if (rdCostB < decisions->rdCost[decisionB]) { - decisionB->rdCost = rdCostB; - decisionB->absLevel = pqDataB->absLevel; - decisionB->prevId = state->m_stateId; + decisions->rdCost[decisionB] = rdCostB; + decisions->absLevel[decisionB] = pqDataB->absLevel; + decisions->prevId[decisionB] = state->m_stateId; } } -static INLINE void checkRdCostSkipSbb(const depquant_state* const state, Decision *decision) +static INLINE void checkRdCostSkipSbb(const depquant_state* const state, Decision * decisions, int decision_id) { int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; - if (rdCost < decision->rdCost) + if (rdCost < decisions->rdCost[decision_id]) { - decision->rdCost = rdCost; - decision->absLevel = 0; - decision->prevId = 4 + state->m_stateId; + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId; } } -static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decision) +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) { int64_t rdCost = 
pqData->deltaDist + lastOffset; if (pqData->absLevel < 4) @@ -640,11 +648,11 @@ static INLINE void checkRdCostStart(const depquant_state* const state, int32_t l const coeff_t value = (pqData->absLevel - 4) >> 1; rdCost += state->m_coeffFracBits[pqData->absLevel - (value << 1)] + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; } - if (rdCost < decision->rdCost) + if (rdCost < decisions->rdCost[decision_id]) { - decision->rdCost = rdCost; - decision->absLevel = pqData->absLevel; - decision->prevId = -1; + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel; + decisions->prevId[decision_id] = -1; } } @@ -672,9 +680,8 @@ static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t abs } -#define DINIT(l,p) {INT64_MAX>>2,(l),(p)} -static const Decision startDec[8] = { DINIT(-1,-2),DINIT(-1,-2),DINIT(-1,-2),DINIT(-1,-2),DINIT(0,4),DINIT(0,5),DINIT(0,6),DINIT(0,7) }; -#undef DINIT +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; static void xDecide( @@ -689,36 +696,36 @@ static void xDecide( bool zeroOut, coeff_t quanCoeff) { - memcpy(decisions, startDec, 8 * sizeof(Decision)); + memcpy(decisions, &startDec, sizeof(Decision)); if (zeroOut) { if (spt == SCAN_EOCSBB) { - checkRdCostSkipSbbZeroOut(&decisions[0], &m_skipStates[0]); - checkRdCostSkipSbbZeroOut(&decisions[1], &m_skipStates[1]); - checkRdCostSkipSbbZeroOut(&decisions[2], &m_skipStates[2]); - checkRdCostSkipSbbZeroOut(&decisions[3], &m_skipStates[3]); + checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[0], 0); + checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[1], 1); + checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[2],2); + checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[3],3); } return; } PQData pqData[4]; 
preQuantCoeff(qp, absCoeff, pqData, quanCoeff); - checkRdCosts(&m_prevStates[0], spt, &pqData[0], &pqData[2], &decisions[0], &decisions[2]); - checkRdCosts(&m_prevStates[1], spt, &pqData[0], &pqData[2], &decisions[2], &decisions[0]); - checkRdCosts(&m_prevStates[2], spt, &pqData[3], &pqData[1], &decisions[1], &decisions[3]); - checkRdCosts(&m_prevStates[3], spt, &pqData[3], &pqData[1], &decisions[3], &decisions[1]); + checkRdCosts(&m_prevStates[0], spt, &pqData[0], &pqData[2], decisions, 0, 2); + checkRdCosts(&m_prevStates[1], spt, &pqData[0], &pqData[2], decisions,2, 0); + checkRdCosts(&m_prevStates[2], spt, &pqData[3], &pqData[1], decisions, 1,3); + checkRdCosts(&m_prevStates[3], spt, &pqData[3], &pqData[1], decisions, 3,1); if (spt == SCAN_EOCSBB) { - checkRdCostSkipSbb(&m_skipStates[0], &decisions[0]); - checkRdCostSkipSbb(&m_skipStates[1], &decisions[1]); - checkRdCostSkipSbb(&m_skipStates[2], &decisions[2]); - checkRdCostSkipSbb(&m_skipStates[3], &decisions[3]); + checkRdCostSkipSbb(&m_skipStates[0], decisions, 0); + checkRdCostSkipSbb(&m_skipStates[1], decisions, 1); + checkRdCostSkipSbb(&m_skipStates[2], decisions,2); + checkRdCostSkipSbb(&m_skipStates[3], decisions,3); } - checkRdCostStart(m_startState, lastOffset, &pqData[0], &decisions[0]); - checkRdCostStart(m_startState, lastOffset, &pqData[2], &decisions[2]); + checkRdCostStart(m_startState, lastOffset, &pqData[0], decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData[2], decisions, 2); } @@ -823,33 +830,34 @@ static INLINE void update_common_context( static INLINE void updateStateEOS( - depquant_state * state, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, + depquant_state * state, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t 
gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, const depquant_state* prevStates, const depquant_state* skipStates, - const Decision *decision) + const Decision * decisions, + int decision_id) { - state->m_rdCost = decision->rdCost; - if (decision->prevId > -2) + state->m_rdCost = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { const depquant_state* prvState = 0; - if (decision->prevId >= 4) + if (decisions->prevId[decision_id] >= 4) { - prvState = skipStates + (decision->prevId - 4); + prvState = skipStates + (decisions->prevId[decision_id] - 4); state->m_numSigSbb = 0; memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); } - else if (decision->prevId >= 0) + else if (decisions->prevId[decision_id] >= 0) { - prvState = prevStates + decision->prevId; - state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; + prvState = prevStates + decisions->prevId[decision_id]; + state->m_numSigSbb = prvState->m_numSigSbb + !!decisions->absLevel[decision_id]; memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); } else @@ -858,7 +866,7 @@ static INLINE void updateStateEOS( memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); } uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[scan_pos & 15]); - *temp = (uint8_t)MIN(255, decision->absLevel); + *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); update_common_context(state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below,prvState, state); @@ -874,22 +882,24 @@ static INLINE void updateStateEOS( } static INLINE void updateState( - depquant_state* state, - int numIPos, const uint32_t scan_pos, + depquant_state* state, + int numIPos, + const uint32_t scan_pos, const depquant_state* prevStates, - const Decision* decision, - const uint32_t sigCtxOffsetNext, - const uint32_t 
gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag) { - state->m_rdCost = decision->rdCost; - if (decision->prevId > -2) + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id) { + state->m_rdCost = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { - if (decision->prevId >= 0) + if (decisions->prevId[decision_id] >= 0) { - const depquant_state* prvState = prevStates + decision->prevId; - state->m_numSigSbb = prvState->m_numSigSbb + !!decision->absLevel; + const depquant_state* prvState = prevStates + decisions->prevId[decision_id]; + state->m_numSigSbb = prvState->m_numSigSbb + !!decisions->absLevel[decision_id]; state->m_refSbbCtxId = prvState->m_refSbbCtxId; state->m_sbbFracBits[0] = prvState->m_sbbFracBits[0]; state->m_sbbFracBits[1] = prvState->m_sbbFracBits[1]; @@ -897,7 +907,7 @@ static INLINE void updateState( state->m_goRicePar = prvState->m_goRicePar; if (state->m_remRegBins >= 4) { - state->m_remRegBins -= (decision->absLevel < 2 ? (unsigned)decision->absLevel : 3); + state->m_remRegBins -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); } memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 48 * sizeof(uint8_t)); } @@ -906,12 +916,12 @@ static INLINE void updateState( state->m_numSigSbb = 1; state->m_refSbbCtxId = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - state->m_remRegBins = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decision->absLevel < 2 ? (unsigned)decision->absLevel : 3); + state->m_remRegBins = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); memset(state->m_absLevelsAndCtxInit, 0, 48 * sizeof(uint8_t)); } uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit); - levels[scan_pos & 15] = (uint8_t)MIN(255, decision->absLevel); + levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); if (state->m_remRegBins >= 4) { @@ -1076,7 +1086,7 @@ static void xDecideAndUpdate( int effWidth, int effHeight) { - Decision* decisions = ctxs->m_trellis[scan_pos]; + Decision* decisions = &ctxs->m_trellis[scan_pos]; SWAP(ctxs->m_currStates, ctxs->m_prevStates, depquant_state*); enum ScanPosType spt = 0; @@ -1094,17 +1104,19 @@ static void xDecideAndUpdate( if (scan_pos) { if (!(scan_pos & 15)) { SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); - updateStateEOS(&ctxs->m_currStates[0], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[0]); - updateStateEOS(&ctxs->m_currStates[1], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[1]); - updateStateEOS(&ctxs->m_currStates[2], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[2]); - updateStateEOS(&ctxs->m_currStates[3], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, &decisions[3]); - memcpy(decisions + 4, decisions, 4 * sizeof(Decision)); + updateStateEOS(&ctxs->m_currStates[0], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,0); + updateStateEOS(&ctxs->m_currStates[1], scan_pos, cg_pos, 
sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,1); + updateStateEOS(&ctxs->m_currStates[2], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,2); + updateStateEOS(&ctxs->m_currStates[3], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,3); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(coeff_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); } else if (!zeroOut) { - updateState(&ctxs->m_currStates[0], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[0], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); - updateState(&ctxs->m_currStates[1], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[1], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); - updateState(&ctxs->m_currStates[2], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[2], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); - updateState(&ctxs->m_currStates[3], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, &decisions[3], sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + updateState(&ctxs->m_currStates[0], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); + updateState(&ctxs->m_currStates[1], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1); + updateState(&ctxs->m_currStates[2], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); 
+ updateState(&ctxs->m_currStates[3], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3); } if (spt == SCAN_SOCSBB) { @@ -1308,27 +1320,32 @@ int uvg_dep_quant( width, height); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - + //printf("%d\n", scanIdx); + //for(int i = 0; i < 4; i++) { + // printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); + //} + //printf("\n"); } //===== find best path ===== - Decision decision = {INT64_MAX, -1, -2}; + int prev_id = -1; int64_t minPathCost = 0; for (int8_t stateId = 0; stateId < 4; stateId++) { - int64_t pathCost = dep_quant_context.m_trellis[0][stateId].rdCost; + int64_t pathCost = dep_quant_context.m_trellis[0].rdCost[stateId]; if (pathCost < minPathCost) { - decision.prevId = stateId; + prev_id = stateId; minPathCost = pathCost; } } //===== backward scanning ===== int scanIdx = 0; - for (; decision.prevId >= 0; scanIdx++) { - decision = dep_quant_context.m_trellis[scanIdx][decision.prevId]; + for (; prev_id >= 0; scanIdx++) { + Decision temp = dep_quant_context.m_trellis[scanIdx]; int32_t blkpos = scan[scanIdx]; - coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? -decision.absLevel : decision.absLevel); - *absSum += decision.absLevel; + coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
-temp.absLevel[prev_id] : temp.absLevel[prev_id]); + *absSum += temp.absLevel[prev_id]; + prev_id = temp.prevId[prev_id]; } return *absSum; } From 2f1e9c402054e628d860b3fd836a4bffde4fe8eb Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 6 Apr 2023 15:12:29 +0300 Subject: [PATCH 206/254] [depquant] AoS -> SoA all states --- src/dep_quant.c | 432 +++++++++++++++++++++++++++--------------------- 1 file changed, 239 insertions(+), 193 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 86968e50..b6c246ea 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -143,13 +143,34 @@ typedef struct unsigned effHeight; } depquant_state; +typedef struct +{ + int64_t m_rdCost[12]; + uint16_t m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t m_numSigSbb[12]; + int m_remRegBins[12]; + int8_t m_refSbbCtxId[12]; + uint32_t m_sbbFracBits[12][2]; + uint32_t m_sigFracBits[12][2]; + int32_t m_coeffFracBits[12][6]; + int8_t m_goRicePar[12]; + int8_t m_goRiceZero[12]; + int8_t m_stateId[12]; + uint32_t *m_sigFracBitsArray[12][12]; + int32_t *m_gtxFracBitsArray[21]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} all_depquant_states; + typedef struct { common_context m_common_context; - depquant_state m_allStates[12]; - depquant_state* m_currStates; - depquant_state* m_prevStates; - depquant_state* m_skipStates; + all_depquant_states m_allStates; + int m_curr_state_offset; + int m_prev_state_offset; + int m_skip_state_offset; depquant_state m_startState; quant_block m_quant; Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; @@ -526,66 +547,66 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2] state->m_sbbFracBits[1] = 0; } -static INLINE void checkRdCostSkipSbbZeroOut(Decision * decision, const depquant_state * const state, int decision_id) -{ - int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; - decision->rdCost[decision_id] = rdCost; +static INLINE void 
checkRdCostSkipSbbZeroOut(Decision * decision, const all_depquant_states * const state, int decision_id, int skip_offset) { + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; decision->absLevel[decision_id] = 0; - decision->prevId[decision_id] = 4 + state->m_stateId; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; } static void checkRdCosts( - const depquant_state * const state, - const enum ScanPosType spt, - const PQData *pqDataA, - const PQData *pqDataB, - Decision *decisions, - int decisionA, - int decisionB) + const all_depquant_states * const state, + const enum ScanPosType spt, + const PQData * pqDataA, + const PQData * pqDataB, + Decision * decisions, + const int decisionA, + const int decisionB, + const int state_offset) { - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar]; - int64_t rdCostA = state->m_rdCost + pqDataA->deltaDist; - int64_t rdCostB = state->m_rdCost + pqDataB->deltaDist; - int64_t rdCostZ = state->m_rdCost; - if (state->m_remRegBins >= 4) + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataB->deltaDist; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { if (pqDataA->absLevel < 4) { - rdCostA += state->m_coeffFracBits[pqDataA->absLevel]; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel]; } else { const coeff_t value = (pqDataA->absLevel - 4) >> 1; rdCostA += - state->m_coeffFracBits[pqDataA->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + state->m_coeffFracBits[state_offset][pqDataA->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; } if (pqDataB->absLevel < 4) { - rdCostB += state->m_coeffFracBits[pqDataB->absLevel]; + rdCostB += state->m_coeffFracBits[state_offset][pqDataB->absLevel]; } else { const coeff_t value = (pqDataB->absLevel - 4) >> 1; rdCostB += - state->m_coeffFracBits[pqDataB->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + state->m_coeffFracBits[state_offset][pqDataB->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } if (spt == SCAN_ISCSBB) { - rdCostA += state->m_sigFracBits[1]; - rdCostB += state->m_sigFracBits[1]; - rdCostZ += state->m_sigFracBits[0]; + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; } else if (spt == SCAN_SOCSBB) { - rdCostA += state->m_sbbFracBits[1] + state->m_sigFracBits[1]; - rdCostB += state->m_sbbFracBits[1] + state->m_sigFracBits[1]; - rdCostZ += state->m_sbbFracBits[1] + state->m_sigFracBits[0]; + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; } - else if (state->m_numSigSbb) + else if (state->m_numSigSbb[state_offset]) { - rdCostA += state->m_sigFracBits[1]; - rdCostB += state->m_sigFracBits[1]; - rdCostZ += state->m_sigFracBits[0]; + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; } else { @@ -595,43 +616,43 @@ static void checkRdCosts( else { rdCostA += - (1 << SCALE_BITS) - + goRiceTab[pqDataA->absLevel <= state->m_goRiceZero ? pqDataA->absLevel - 1 + (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel - 1 : (pqDataA->absLevel < RICEMAX ? 
pqDataA->absLevel : RICEMAX - 1)]; rdCostB += - (1 << SCALE_BITS) - + goRiceTab[pqDataB->absLevel <= state->m_goRiceZero ? pqDataB->absLevel - 1 + (1 << SCALE_BITS) + goRiceTab[pqDataB->absLevel <= state->m_goRiceZero[state_offset] + ? pqDataB->absLevel - 1 : (pqDataB->absLevel < RICEMAX ? pqDataB->absLevel : RICEMAX - 1)]; - rdCostZ += goRiceTab[state->m_goRiceZero]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; } if (rdCostA < decisions->rdCost[decisionA]) { decisions->rdCost[decisionA] = rdCostA; decisions->absLevel[decisionA] = pqDataA->absLevel; - decisions->prevId[decisionA] = state->m_stateId; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; } if (rdCostZ < decisions->rdCost[decisionA]) { decisions->rdCost[decisionA] = rdCostZ; decisions->absLevel[decisionA] = 0; - decisions->prevId[decisionA] = state->m_stateId; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; } if (rdCostB < decisions->rdCost[decisionB]) { decisions->rdCost[decisionB] = rdCostB; decisions->absLevel[decisionB] = pqDataB->absLevel; - decisions->prevId[decisionB] = state->m_stateId; + decisions->prevId[decisionB] = state->m_stateId[state_offset]; } } -static INLINE void checkRdCostSkipSbb(const depquant_state* const state, Decision * decisions, int decision_id) +static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) { - int64_t rdCost = state->m_rdCost + state->m_sbbFracBits[0]; + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; if (rdCost < decisions->rdCost[decision_id]) { decisions->rdCost[decision_id] = rdCost; decisions->absLevel[decision_id] = 0; - decisions->prevId[decision_id] = 4 + state->m_stateId; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; } } @@ -685,16 +706,17 @@ static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, IN static void 
xDecide( - depquant_state* const m_skipStates, - depquant_state* const m_prevStates, - depquant_state* const m_startState, - quant_block *qp, - const enum ScanPosType spt, - const coeff_t absCoeff, - const int lastOffset, - Decision* decisions, - bool zeroOut, - coeff_t quanCoeff) + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block * qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) { memcpy(decisions, &startDec, sizeof(Decision)); @@ -702,26 +724,26 @@ static void xDecide( { if (spt == SCAN_EOCSBB) { - checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[0], 0); - checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[1], 1); - checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[2],2); - checkRdCostSkipSbbZeroOut(decisions, &m_skipStates[3],3); + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states,2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states,3, skip_offset); } return; } PQData pqData[4]; preQuantCoeff(qp, absCoeff, pqData, quanCoeff); - checkRdCosts(&m_prevStates[0], spt, &pqData[0], &pqData[2], decisions, 0, 2); - checkRdCosts(&m_prevStates[1], spt, &pqData[0], &pqData[2], decisions,2, 0); - checkRdCosts(&m_prevStates[2], spt, &pqData[3], &pqData[1], decisions, 1,3); - checkRdCosts(&m_prevStates[3], spt, &pqData[3], &pqData[1], decisions, 3,1); + checkRdCosts(all_states, spt, &pqData[0], &pqData[2], decisions, 0, 2, prev_offset + 0); + checkRdCosts(all_states, spt, &pqData[0], &pqData[2], decisions, 2, 0, prev_offset + 1); + checkRdCosts(all_states, spt, &pqData[3], &pqData[1], decisions, 1, 3, prev_offset + 2); + checkRdCosts(all_states, spt, &pqData[3], &pqData[1], decisions, 3, 1, prev_offset + 3); if (spt == SCAN_EOCSBB) { - 
checkRdCostSkipSbb(&m_skipStates[0], decisions, 0); - checkRdCostSkipSbb(&m_skipStates[1], decisions, 1); - checkRdCostSkipSbb(&m_skipStates[2], decisions,2); - checkRdCostSkipSbb(&m_skipStates[3], decisions,3); + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions,2, skip_offset); + checkRdCostSkipSbb(all_states, decisions,3, skip_offset); } checkRdCostStart(m_startState, lastOffset, &pqData[0], decisions, 0); @@ -756,45 +778,46 @@ static INLINE unsigned templateAbsCompare(coeff_t sum) } static INLINE void update_common_context( + context_store* ctxs, common_context * cc, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, - const depquant_state* prevState, - depquant_state *currState) + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const int prev_state, + const int curr_state) { const uint32_t numSbb = width_in_sbb * height_in_sbb; - uint8_t* sbbFlags = cc->m_currSbbCtx[currState->m_stateId].sbbFlags; - uint8_t* levels = cc->m_currSbbCtx[currState->m_stateId].levels; + uint8_t* sbbFlags = cc->m_currSbbCtx[curr_state & 3].sbbFlags; + uint8_t* levels = cc->m_currSbbCtx[curr_state & 3].levels; size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - if (prevState && prevState->m_refSbbCtxId >= 0) { - memcpy(sbbFlags, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_prevSbbCtx[prevState->m_refSbbCtxId].levels + scan_pos, setCpSize); + if (prev_state != -1 && ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) { + memcpy(sbbFlags, cc->m_prevSbbCtx[ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * 
sizeof(uint8_t)); + memcpy(levels + scan_pos, cc->m_prevSbbCtx[ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); } else { memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); memset(levels + scan_pos, 0, setCpSize); } - sbbFlags[cg_pos] = !!currState->m_numSigSbb; - memcpy(levels + scan_pos, currState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); + sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; + memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 16 * sizeof(uint8_t)); const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0); - currState->m_numSigSbb = 0; - if (prevState) { - currState->m_remRegBins = prevState->m_remRegBins; + ctxs->m_allStates.m_numSigSbb[curr_state] = 0; + if (prev_state != -1) { + ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state]; } else { int ctxBinSampleRatio = 28; // (scanInfo.chType == COLOR_Y) ? 
MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - currState->m_remRegBins = (currState->effWidth * currState->effHeight * ctxBinSampleRatio) / 16; + ctxs->m_allStates.m_remRegBins[curr_state] = (ctxs->m_allStates.effWidth * ctxs->m_allStates.effHeight * ctxBinSampleRatio) / 16; } - currState->m_goRicePar = 0; - currState->m_refSbbCtxId = currState->m_stateId; - currState->m_sbbFracBits[0] = cc->m_sbbFlagBits[sigNSbb][0]; - currState->m_sbbFracBits[1] = cc->m_sbbFlagBits[sigNSbb][1]; + ctxs->m_allStates.m_goRicePar[curr_state] = 0; + ctxs->m_allStates.m_refSbbCtxId[curr_state] = curr_state & 3; + ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; + ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; uint16_t templateCtxInit[16]; const int scanBeg = scan_pos - 16; @@ -824,108 +847,109 @@ static INLINE void update_common_context( templateCtxInit[id] = 0; } } - memset(currState->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); - memcpy(currState->m_absLevelsAndCtxInit + 8, templateCtxInit, 16 * sizeof(uint16_t)); + memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); + memcpy(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state] + 8, templateCtxInit, 16 * sizeof(uint16_t)); } static INLINE void updateStateEOS( - depquant_state * state, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, - const depquant_state* prevStates, - const depquant_state* skipStates, - const Decision * decisions, - int decision_id) + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + 
const uint32_t next_sbb_below, + const Decision * decisions, + int decision_id) { - state->m_rdCost = decisions->rdCost[decision_id]; + all_depquant_states* state = &ctxs->m_allStates; + int curr_state_offset = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[curr_state_offset] = decisions->rdCost[decision_id]; if (decisions->prevId[decision_id] > -2) { - const depquant_state* prvState = 0; + int prvState = -1; if (decisions->prevId[decision_id] >= 4) { - prvState = skipStates + (decisions->prevId[decision_id] - 4); - state->m_numSigSbb = 0; - memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { - prvState = prevStates + decisions->prevId[decision_id]; - state->m_numSigSbb = prvState->m_numSigSbb + !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 16 * sizeof(uint8_t)); + prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] + !!decisions->absLevel[decision_id]; + memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prvState], 16 * sizeof(uint8_t)); } else { - state->m_numSigSbb = 1; - memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); + state->m_numSigSbb[curr_state_offset] = 1; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } - uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[scan_pos & 15]); + uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); - update_common_context(state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, 
next_sbb_below,prvState, state); + update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right,next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); - coeff_t tinit = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_absLevelsAndCtxInit[curr_state_offset][8 + ((scan_pos - 1) & 15)]; coeff_t sumNum = tinit & 7; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[0] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[1] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + state->m_sigFracBits[curr_state_offset][0] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[curr_state_offset][1] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits, state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits)); + memcpy(state->m_coeffFracBits[curr_state_offset], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); } } static INLINE void updateState( - depquant_state* state, - int numIPos, - const uint32_t scan_pos, - const depquant_state* prevStates, - const Decision* decisions, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag, - int decision_id) { - state->m_rdCost = decisions->rdCost[decision_id]; + context_store * ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id) { + all_depquant_states* state = &ctxs->m_allStates; + int state_id = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; if (decisions->prevId[decision_id] > -2) { if (decisions->prevId[decision_id] >= 0) { - const depquant_state* prvState = prevStates + decisions->prevId[decision_id]; - state->m_numSigSbb = prvState->m_numSigSbb + !!decisions->absLevel[decision_id]; - state->m_refSbbCtxId = prvState->m_refSbbCtxId; - state->m_sbbFracBits[0] = prvState->m_sbbFracBits[0]; - state->m_sbbFracBits[1] = prvState->m_sbbFracBits[1]; - state->m_remRegBins = prvState->m_remRegBins - 1; - state->m_goRicePar = prvState->m_goRicePar; - if (state->m_remRegBins >= 4) + const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) + !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 
4) { - state->m_remRegBins -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevelsAndCtxInit, prvState->m_absLevelsAndCtxInit, 48 * sizeof(uint8_t)); + memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); } else { - state->m_numSigSbb = 1; - state->m_refSbbCtxId = -1; + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - state->m_remRegBins = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit, 0, 48 * sizeof(uint8_t)); + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); + memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); } - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit); + uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); - if (state->m_remRegBins >= 4) + if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } @@ -961,12 +985,12 @@ static INLINE void updateState( } #undef UPDATE coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[0] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[1] = state->m_sigFracBitsArray[sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits, state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits)); + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)] >> 8; + coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } if (numIPos == 1) { @@ -1004,18 +1028,18 @@ static INLINE void updateState( unsigned currentShift = templateAbsCompare(sumAbs); sumAbs = sumAbs >> currentShift; int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); - state->m_goRicePar = g_goRiceParsCoeff[sumAll]; - state->m_goRicePar += currentShift; + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] += currentShift; } else { int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); - state->m_goRicePar = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; } } else { - coeff_t sumAbs = state->m_absLevelsAndCtxInit[8 + ((scan_pos - 1) & 15)] >> 8; + coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } if (numIPos == 1) { @@ -1053,19 +1077,20 @@ static INLINE void updateState( unsigned currentShift = templateAbsCompare(sumAbs); sumAbs = sumAbs >> currentShift; sumAbs = MIN(31, sumAbs); - state->m_goRicePar = g_goRiceParsCoeff[sumAbs]; - state->m_goRicePar += currentShift; + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] += currentShift; } else { sumAbs = MIN(31, sumAbs); - state->m_goRicePar = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; } - state->m_goRiceZero = (state->m_stateId < 2 ? 1 : 2) << state->m_goRicePar; + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; } } } +static bool same[13]; static void xDecideAndUpdate( rate_estimator* re, context_store* ctxs, @@ -1087,7 +1112,7 @@ static void xDecideAndUpdate( int effHeight) { Decision* decisions = &ctxs->m_trellis[scan_pos]; - SWAP(ctxs->m_currStates, ctxs->m_prevStates, depquant_state*); + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); enum ScanPosType spt = 0; if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) @@ -1099,28 +1124,28 @@ static void xDecideAndUpdate( spt = SCAN_EOCSBB; } - xDecide(ctxs->m_skipStates, ctxs->m_prevStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[pos_x] + re->m_lastBitsY[pos_y], decisions, zeroOut, quantCoeff); + xDecide(&ctxs->m_allStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[pos_x] + re->m_lastBitsY[pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); if (scan_pos) { if (!(scan_pos & 15)) { SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); - updateStateEOS(&ctxs->m_currStates[0], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,0); - updateStateEOS(&ctxs->m_currStates[1], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,1); - updateStateEOS(&ctxs->m_currStates[2], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,2); - updateStateEOS(&ctxs->m_currStates[3], scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, ctxs->m_prevStates, ctxs->m_skipStates, decisions,3); + updateStateEOS(ctxs, scan_pos, cg_pos, 
sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 0); + updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 1); + updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 2); + updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 3); memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int)); memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(coeff_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); } else if (!zeroOut) { - updateState(&ctxs->m_currStates[0], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); - updateState(&ctxs->m_currStates[1], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1); - updateState(&ctxs->m_currStates[2], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); - updateState(&ctxs->m_currStates[3], next_nb_info_ssb.num, scan_pos, ctxs->m_prevStates, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3); + updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); + updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1); + updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); + updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, 
false, 3); } if (spt == SCAN_SOCSBB) { - SWAP(ctxs->m_skipStates, ctxs->m_prevStates, depquant_state*); + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); } } } @@ -1142,9 +1167,9 @@ int uvg_dep_quant( //===== reset / pre-init ===== const int baseLevel = 4; context_store dep_quant_context; - dep_quant_context.m_currStates = &dep_quant_context.m_allStates[0]; - dep_quant_context.m_prevStates = &dep_quant_context.m_allStates[4]; - dep_quant_context.m_skipStates = &dep_quant_context.m_allStates[8]; + dep_quant_context.m_curr_state_offset = 0; + dep_quant_context.m_prev_state_offset = 4; + dep_quant_context.m_skip_state_offset = 8; const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? cur_tu->lfnst_idx : @@ -1226,18 +1251,32 @@ int uvg_dep_quant( int effectHeight = MIN(32, effHeight); int effectWidth = MIN(32, effWidth); for (int k = 0; k < 12; k++) { - depquant_state_init(&dep_quant_context.m_allStates[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); - dep_quant_context.m_allStates[k].effHeight = effectHeight; - dep_quant_context.m_allStates[k].effWidth = effectWidth; - dep_quant_context.m_allStates[k].m_commonCtx = &dep_quant_context.m_common_context; - dep_quant_context.m_allStates[k].m_stateId = k & 3; + dep_quant_context.m_allStates.m_rdCost[k] = INT64_MAX >> 1; + dep_quant_context.m_allStates.m_numSigSbb[k] = 0; + dep_quant_context.m_allStates.m_remRegBins[k] = 4; // just large enough for last scan pos + dep_quant_context.m_allStates.m_refSbbCtxId[k] = -1; + dep_quant_context.m_allStates.m_sigFracBits[k][0] = rate_estimator.m_sigFracBits[0][0][0]; + dep_quant_context.m_allStates.m_sigFracBits[k][1] = rate_estimator.m_sigFracBits[0][0][1]; + memcpy(dep_quant_context.m_allStates.m_coeffFracBits[k], rate_estimator.m_gtxFracBits[0], sizeof(dep_quant_context.m_allStates.m_coeffFracBits[k])); + dep_quant_context.m_allStates.m_goRicePar[k] = 0; + dep_quant_context.m_allStates.m_goRiceZero[k] = 0; + + 
dep_quant_context.m_allStates.m_sbbFracBits[k][0] = 0; + dep_quant_context.m_allStates.m_sbbFracBits[k][1] = 0; + + dep_quant_context.m_allStates.m_stateId[k] = k & 3; for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { - dep_quant_context.m_allStates[k].m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i]; - } - for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { - dep_quant_context.m_allStates[k].m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + dep_quant_context.m_allStates.m_sigFracBitsArray[k][i] = rate_estimator.m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i]; } } + + dep_quant_context.m_allStates.effHeight = effectHeight; + dep_quant_context.m_allStates.effWidth = effectWidth; + dep_quant_context.m_allStates.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + dep_quant_context.m_allStates.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + } + depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); dep_quant_context.m_startState.effHeight = effectHeight; dep_quant_context.m_startState.effWidth = effectWidth; @@ -1250,7 +1289,6 @@ int uvg_dep_quant( dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; } - const uint32_t height_in_sbb = MAX(height >> 2, 1); const uint32_t width_in_sbb = MAX(width >> 2, 1); //===== populate trellis ===== @@ -1320,16 +1358,18 @@ int uvg_dep_quant( width, height); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - //printf("%d\n", scanIdx); - //for(int i = 0; i < 4; i++) { - // printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); - //} - //printf("\n"); + if(0){ + printf("%d\n", scanIdx); + for (int i = 0; i < 4; i++) { + printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], 
ctxs->m_trellis[scanIdx].prevId[i]); + } + printf("\n"); + } } //===== find best path ===== int prev_id = -1; - int64_t minPathCost = 0; + int64_t minPathCost = 0; for (int8_t stateId = 0; stateId < 4; stateId++) { int64_t pathCost = dep_quant_context.m_trellis[0].rdCost[stateId]; if (pathCost < minPathCost) { @@ -1340,6 +1380,12 @@ int uvg_dep_quant( //===== backward scanning ===== int scanIdx = 0; + context_store* ctxs = &dep_quant_context; + // printf("%d\n", scanIdx); + //for (int i = 0; i < 4; i++) { + // printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); + //} + //printf("\n"); for (; prev_id >= 0; scanIdx++) { Decision temp = dep_quant_context.m_trellis[scanIdx]; int32_t blkpos = scan[scanIdx]; From 64d34f85591b3d6533eaa7927732905f61cc342a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 6 Apr 2023 16:07:38 +0300 Subject: [PATCH 207/254] [depquant] AoS -> SoA pre quant --- src/dep_quant.c | 95 +++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index b6c246ea..d46dd61a 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -89,8 +89,8 @@ typedef struct typedef struct { - coeff_t absLevel; - int64_t deltaDist; + coeff_t absLevel[4]; + int64_t deltaDist[4]; } PQData; typedef struct @@ -558,37 +558,38 @@ static void checkRdCosts( const all_depquant_states * const state, const enum ScanPosType spt, const PQData * pqDataA, - const PQData * pqDataB, Decision * decisions, const int decisionA, const int decisionB, const int state_offset) { + const int pqA = decisionA && decisionB ? 3 : 0; + const int pqB = decisionA && decisionB ? 
1 : 2; const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist; - int64_t rdCostB = state->m_rdCost[state_offset] + pqDataB->deltaDist; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; int64_t rdCostZ = state->m_rdCost[state_offset]; if (state->m_remRegBins[state_offset] >= 4) { - if (pqDataA->absLevel < 4) + if (pqDataA->absLevel[pqA] < 4) { - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel]; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; } else { - const coeff_t value = (pqDataA->absLevel - 4) >> 1; + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; rdCostA += - state->m_coeffFracBits[state_offset][pqDataA->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } - if (pqDataB->absLevel < 4) + if (pqDataA->absLevel[pqB] < 4) { - rdCostB += state->m_coeffFracBits[state_offset][pqDataB->absLevel]; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; } else { - const coeff_t value = (pqDataB->absLevel - 4) >> 1; + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; rdCostB += - state->m_coeffFracBits[state_offset][pqDataB->absLevel - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } if (spt == SCAN_ISCSBB) { @@ -616,19 +617,19 @@ static void checkRdCosts( else { rdCostA += - (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel <= state->m_goRiceZero[state_offset] - ? pqDataA->absLevel - 1 - : (pqDataA->absLevel < RICEMAX ? 
pqDataA->absLevel : RICEMAX - 1)]; + (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqA] - 1 + : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; rdCostB += - (1 << SCALE_BITS) + goRiceTab[pqDataB->absLevel <= state->m_goRiceZero[state_offset] - ? pqDataB->absLevel - 1 - : (pqDataB->absLevel < RICEMAX ? pqDataB->absLevel : RICEMAX - 1)]; + (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqB] - 1 + : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; } if (rdCostA < decisions->rdCost[decisionA]) { decisions->rdCost[decisionA] = rdCostA; - decisions->absLevel[decisionA] = pqDataA->absLevel; + decisions->absLevel[decisionA] = pqDataA->absLevel[pqA]; decisions->prevId[decisionA] = state->m_stateId[state_offset]; } if (rdCostZ < decisions->rdCost[decisionA]) @@ -640,7 +641,7 @@ static void checkRdCosts( if (rdCostB < decisions->rdCost[decisionB]) { decisions->rdCost[decisionB] = rdCostB; - decisions->absLevel[decisionB] = pqDataB->absLevel; + decisions->absLevel[decisionB] = pqDataA->absLevel[pqB]; decisions->prevId[decisionB] = state->m_stateId[state_offset]; } } @@ -659,20 +660,20 @@ static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, De static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int decision_id) { - int64_t rdCost = pqData->deltaDist + lastOffset; - if (pqData->absLevel < 4) + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { - rdCost += state->m_coeffFracBits[pqData->absLevel]; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; } else { - const coeff_t value = (pqData->absLevel - 4) >> 1; - rdCost += state->m_coeffFracBits[pqData->absLevel - (value 
<< 1)] + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; } if (rdCost < decisions->rdCost[decision_id]) { decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = pqData->absLevel; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; decisions->prevId[decision_id] = -1; } } @@ -683,21 +684,21 @@ static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t abs int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; coeff_t qIdx = MAX(1, MIN(qp->m_maxQIdx, (coeff_t)((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; - PQData *pq_a = &pqData[qIdx & 3]; - pq_a->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pq_a->absLevel = (++qIdx) >> 1; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; scaledAdd += qp->m_DistStepAdd; - PQData *pq_b = &pqData[qIdx & 3]; - pq_b->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pq_b->absLevel = (++qIdx) >> 1; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; scaledAdd += qp->m_DistStepAdd; - PQData *pq_c = &pqData[qIdx & 3]; - pq_c->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pq_c->absLevel = (++qIdx) >> 1; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; scaledAdd += qp->m_DistStepAdd; - PQData *pq_d = &pqData[qIdx & 3]; - pq_d->deltaDist = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pq_d->absLevel = 
(++qIdx) >> 1; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; } @@ -732,12 +733,12 @@ static void xDecide( return; } - PQData pqData[4]; - preQuantCoeff(qp, absCoeff, pqData, quanCoeff); - checkRdCosts(all_states, spt, &pqData[0], &pqData[2], decisions, 0, 2, prev_offset + 0); - checkRdCosts(all_states, spt, &pqData[0], &pqData[2], decisions, 2, 0, prev_offset + 1); - checkRdCosts(all_states, spt, &pqData[3], &pqData[1], decisions, 1, 3, prev_offset + 2); - checkRdCosts(all_states, spt, &pqData[3], &pqData[1], decisions, 3, 1, prev_offset + 3); + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); if (spt == SCAN_EOCSBB) { checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); @@ -746,8 +747,8 @@ static void xDecide( checkRdCostSkipSbb(all_states, decisions,3, skip_offset); } - checkRdCostStart(m_startState, lastOffset, &pqData[0], decisions, 0); - checkRdCostStart(m_startState, lastOffset, &pqData[2], decisions, 2); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); } From 2912db5fca567fb89ac5ec6b7734fd4da8713a84 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 7 Apr 2023 08:14:06 +0300 Subject: [PATCH 208/254] [dep_quant.c] Small refactor --- src/dep_quant.c | 715 +++++++++++++++++++++++------------------------- 1 file changed, 338 insertions(+), 377 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index d46dd61a..f97dde4d 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -547,11 +547,15 @@ static void depquant_state_init(depquant_state* 
state, uint32_t sig_frac_bits[2] state->m_sbbFracBits[1] = 0; } -static INLINE void checkRdCostSkipSbbZeroOut(Decision * decision, const all_depquant_states * const state, int decision_id, int skip_offset) { - int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; - decision->rdCost[decision_id] = rdCost; - decision->absLevel[decision_id] = 0; - decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) { + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; } static void checkRdCosts( @@ -564,141 +568,126 @@ static void checkRdCosts( const int state_offset) { const int pqA = decisionA && decisionB ? 3 : 0; - const int pqB = decisionA && decisionB ? 1 : 2; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; - int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; - int64_t rdCostZ = state->m_rdCost[state_offset]; - if (state->m_remRegBins[state_offset] >= 4) - { - if (pqDataA->absLevel[pqA] < 4) - { - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; - } - else - { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rdCostA += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; - } - if (pqDataA->absLevel[pqB] < 4) - { - rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; - } - else - { - const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rdCostB += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - if (spt == SCAN_ISCSBB) - { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } - else if (spt == SCAN_SOCSBB) - { - rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; - } - else if (state->m_numSigSbb[state_offset]) - { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } - else - { - rdCostZ = decisions->rdCost[decisionA]; - } + const int pqB = decisionA && decisionB ? 1 : 2; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; } - else - { - rdCostA += - (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] - ? pqDataA->absLevel[pqA] - 1 - : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; - rdCostB += - (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] - ? 
pqDataA->absLevel[pqB] - 1 - : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; - rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += + state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[ + value < RICEMAX ? value : RICEMAX - 1]; } - if (rdCostA < decisions->rdCost[decisionA]) - { - decisions->rdCost[decisionA] = rdCostA; - decisions->absLevel[decisionA] = pqDataA->absLevel[pqA]; - decisions->prevId[decisionA] = state->m_stateId[state_offset]; + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; } - if (rdCostZ < decisions->rdCost[decisionA]) - { - decisions->rdCost[decisionA] = rdCostZ; - decisions->absLevel[decisionA] = 0; - decisions->prevId[decisionA] = state->m_stateId[state_offset]; + else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += + state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[ + value < RICEMAX ? 
value : RICEMAX - 1]; } - if (rdCostB < decisions->rdCost[decisionB]) - { - decisions->rdCost[decisionB] = rdCostB; - decisions->absLevel[decisionB] = pqDataA->absLevel[pqB]; - decisions->prevId[decisionB] = state->m_stateId[state_offset]; + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; } + else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } + else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } + else { + rdCostZ = decisions->rdCost[decisionA]; + } + } + else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqA] - 1 + : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqB] - 1 + : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + if (rdCostA < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostA; + decisions->absLevel[decisionA] = pqDataA->absLevel[pqA]; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostZ < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostZ; + decisions->absLevel[decisionA] = 0; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostB < decisions->rdCost[decisionB]) { + decisions->rdCost[decisionB] = rdCostB; + decisions->absLevel[decisionB] = pqDataA->absLevel[pqB]; + decisions->prevId[decisionB] = state->m_stateId[state_offset]; + } } static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) { - int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; - if (rdCost < decisions->rdCost[decision_id]) - { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = 0; - decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; - } + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } } static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int decision_id) { - int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; - if (pqData->absLevel[decision_id] < 4) - { - rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; - } - else - { - const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; - rdCost += 
state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; - } - if (rdCost < decisions->rdCost[decision_id]) - { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; - decisions->prevId[decision_id] = -1; - } + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } } static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) { - int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; - coeff_t qIdx = MAX(1, MIN(qp->m_maxQIdx, (coeff_t)((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); - int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; - int index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - 
pqData->absLevel[index] = (++qIdx) >> 1; + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, MIN(qp->m_maxQIdx, (coeff_t)((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; } @@ -719,63 +708,55 @@ static void xDecide( const int skip_offset, const int prev_offset) { - memcpy(decisions, &startDec, sizeof(Decision)); + memcpy(decisions, &startDec, sizeof(Decision)); - if (zeroOut) - { - if (spt == SCAN_EOCSBB) - { - checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states,2, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states,3, skip_offset); - } - return; + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); } + return; + } - PQData pqData; - preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); - checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - checkRdCosts(all_states, 
spt, &pqData, decisions, 2, 0, prev_offset + 1); - checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); - if (spt == SCAN_EOCSBB) - { - checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); - checkRdCostSkipSbb(all_states, decisions,2, skip_offset); - checkRdCostSkipSbb(all_states, decisions,3, skip_offset); - } + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); } static INLINE unsigned templateAbsCompare(coeff_t sum) { - int rangeIdx = 0; - if (sum < g_riceT[0]) - { - rangeIdx = 0; - } - else if (sum < g_riceT[1]) - { - rangeIdx = 1; - } - else if (sum < g_riceT[2]) - { - rangeIdx = 2; - } - else if (sum < g_riceT[3]) - { - rangeIdx = 3; - } - else - { - rangeIdx = 4; - } - return g_riceShift[rangeIdx]; + int rangeIdx = 0; + if (sum < g_riceT[0]) { + rangeIdx = 0; + } + else if (sum < g_riceT[1]) { + rangeIdx = 1; + } + else if (sum < g_riceT[2]) { + rangeIdx = 2; + } + else if (sum < g_riceT[3]) { + rangeIdx = 3; + } + else { + rangeIdx = 4; + } + 
return g_riceShift[rangeIdx]; } static INLINE void update_common_context( @@ -866,43 +847,41 @@ static INLINE void updateStateEOS( const Decision * decisions, int decision_id) { - all_depquant_states* state = &ctxs->m_allStates; - int curr_state_offset = ctxs->m_curr_state_offset + decision_id; - state->m_rdCost[curr_state_offset] = decisions->rdCost[decision_id]; - if (decisions->prevId[decision_id] > -2) - { - int prvState = -1; - if (decisions->prevId[decision_id] >= 4) - { - prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); - state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - } - else if (decisions->prevId[decision_id] >= 0) - { - prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] + !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prvState], 16 * sizeof(uint8_t)); - } - else - { - state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - } - uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); - *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); - - update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right,next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); - - coeff_t tinit = state->m_absLevelsAndCtxInit[curr_state_offset][8 + ((scan_pos - 1) & 15)]; - coeff_t sumNum = tinit & 7; - coeff_t sumAbs1 = (tinit >> 3) & 31; - coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[curr_state_offset][0] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[curr_state_offset][1] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + 
MIN((sumAbs1 + 1) >> 1, 3)][1]; - - memcpy(state->m_coeffFracBits[curr_state_offset], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + all_depquant_states* state = &ctxs->m_allStates; + int curr_state_offset = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[curr_state_offset] = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { + int prvState = -1; + if (decisions->prevId[decision_id] >= 4) { + prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } + else if (decisions->prevId[decision_id] >= 0) { + prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] + !!decisions->absLevel[decision_id]; + memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prvState], 16 * sizeof(uint8_t)); + } + else { + state->m_numSigSbb[curr_state_offset] = 1; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + } + uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); + *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + + update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, + next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); + + coeff_t tinit = state->m_absLevelsAndCtxInit[curr_state_offset][8 + ((scan_pos - 1) & 15)]; + coeff_t sumNum = tinit & 7; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[curr_state_offset][0] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[curr_state_offset][1] = 
state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + + memcpy(state->m_coeffFracBits[curr_state_offset], + state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + } } static INLINE void updateState( @@ -916,179 +895,161 @@ static INLINE void updateState( const int baseLevel, const bool extRiceFlag, int decision_id) { - all_depquant_states* state = &ctxs->m_allStates; - int state_id = ctxs->m_curr_state_offset + decision_id; - state->m_rdCost[state_id] = decisions->rdCost[decision_id]; - if (decisions->prevId[decision_id] > -2) - { - if (decisions->prevId[decision_id] >= 0) - { - const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) + !!decisions->absLevel[decision_id]; - state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; - state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; - state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; - state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; - state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; - if (state->m_remRegBins[state_id] >= 4) - { - state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - } - memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); - } - else - { - state->m_numSigSbb[state_id] = 1; - state->m_refSbbCtxId[state_id] = -1; - int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); - } - - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); - levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); - - if (state->m_remRegBins[state_id] >= 4) - { - coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; - coeff_t sumAbs1 = (tinit >> 3) & 31; - coeff_t sumNum = tinit & 7; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } - if (numIPos == 1) - { - UPDATE(0); - } - else if (numIPos == 2) - { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); - } -#undef UPDATE - coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - - - coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } - if (numIPos == 1) - { - UPDATE(0); - } - else if (numIPos == 2) - { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); - } -#undef UPDATE - if (extRiceFlag) - { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - state->m_goRicePar[state_id] += currentShift; - } - else - { - int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - } - } - else - { - coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } - if (numIPos == 1) - { - UPDATE(0); - } - else if (numIPos == 2) - { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) - { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); - } -#undef UPDATE - if (extRiceFlag) - { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - state->m_goRicePar[state_id] += currentShift; - } - else - { - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - } - state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; - } + all_depquant_states* state = &ctxs->m_allStates; + int state_id = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { + if (decisions->prevId[decision_id] >= 0) { + const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) + !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 4) { + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 + ? (unsigned)decisions->absLevel[decision_id] + : 3); + } + memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); } + else { + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; + int ctxBinSampleRatio = 28; + //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( + decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); + memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); + } + + uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); + levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + if (numIPos == 1) { + UPDATE(0); + } + else if (numIPos == 2) { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], + sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } + if (numIPos == 1) { + UPDATE(0); + } + else if (numIPos == 2) { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] += currentShift; + } + else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } + else { + coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } + if (numIPos == 1) { + UPDATE(0); + } + else if (numIPos == 2) { + UPDATE(0); + UPDATE(1); + } + else if (numIPos == 3) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + } + else if (numIPos == 4) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + } + else if (numIPos == 5) { + UPDATE(0); + UPDATE(1); + UPDATE(2); + UPDATE(3); + UPDATE(4); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] += currentShift; + } + else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; + } + } } static bool same[13]; From 8caabcde1a16261d18799d61489e206dcb629119 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 7 Apr 2023 15:15:58 +0300 Subject: [PATCH 209/254] [avx2] WIP check_rd_costs_avx2 --- src/dep_quant.c | 395 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 383 insertions(+), 12 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index f97dde4d..2ff848b1 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -39,6 +39,7 @@ #include "transform.h" #include "uvg_math.h" #include "generic/quant-generic.h" +#include #define sm_numCtxSetsSig 3 @@ -89,7 +90,7 @@ typedef struct typedef struct { - coeff_t absLevel[4]; + int32_t absLevel[4]; int64_t deltaDist[4]; } PQData; @@ -558,6 +559,368 @@ static INLINE void checkRdCostSkipSbbZeroOut( decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; } + + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int32_t a[64] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}; + __m128i offsets = _mm_set_epi32(12, 8, 4, 0); + __m128i r = _mm_i32gather_epi32(a, offsets, 1); + + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_loadu_si256(&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + 
+ rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + bool all_over_or_four = true; + bool all_under_four = true; + for (int i = 0; i < 4; i++) { + all_over_or_four &= state->m_remRegBins[start + i] >= 4; + all_under_four &= state->m_remRegBins[start + i] < 4; + } + + if (all_over_or_four) { + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + + __m128i max_rice = _mm_set1_epi32(15); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(value, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = 
g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256(&rd_costs[0])); + } + + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + + __m128i max_rice = _mm_set1_epi32(15); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + 
const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256(&rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sbbFracBits_1 = 
_mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } + else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[3], decisions->rdCost[3], decisions->rdCost[0], decisions->rdCost[0]); + } + + else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + int64_t z_out[4] = {0, 0, 0, 0}; + _mm256_storeu_epi64(z_out, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + 
temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + z_out[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_z = _mm256_loadu_epi64(z_out); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_epi64(temp_rd_cost_a)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_epi64(temp_rd_cost_b)); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_loadu_epi64(temp_rd_cost_z)); + } + } + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + } else if (all_under_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(15); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(&state->m_goRiceZero[start])); + // RD cost A + { + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(go_rice_smaller, other, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost b + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_b); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(go_rice_smaller, other, cmp); + + + 
__m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_offset)); + } + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_epi64(temp_rd_cost_a); + rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); + rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); + } + // Decision 0 + if (temp_rd_cost_a[0] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_a[0]; + decisions->absLevel[0] = pqDataA->absLevel[0]; + decisions->prevId[0] = state->m_stateId[start]; + } + if (temp_rd_cost_z[0] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_z[0]; + decisions->absLevel[0] = 0; + decisions->prevId[0] = state->m_stateId[start]; + } + if (temp_rd_cost_b[1] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_b[1]; + decisions->absLevel[0] = pqDataA->absLevel[2]; + decisions->prevId[0] = state->m_stateId[start + 1]; + } + + // Decision 1 + if (temp_rd_cost_a[1] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_a[1]; + decisions->absLevel[2] = pqDataA->absLevel[0]; + decisions->prevId[2] = state->m_stateId[start + 1]; + } + if (temp_rd_cost_z[1] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_z[1]; + decisions->absLevel[2] = 0; + decisions->prevId[2] = state->m_stateId[start + 1]; + } + if (temp_rd_cost_b[0] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_b[0]; + decisions->absLevel[2] = pqDataA->absLevel[2]; + decisions->prevId[2] = state->m_stateId[start]; + } + + // Decision 2 + if (temp_rd_cost_a[2] < decisions->rdCost[0]) { + decisions->rdCost[2] = temp_rd_cost_a[2]; + decisions->absLevel[2] = pqDataA->absLevel[3]; + decisions->prevId[2] = state->m_stateId[start + 2]; + } + if (temp_rd_cost_z[2] < decisions->rdCost[0]) { + decisions->rdCost[2] = temp_rd_cost_z[2]; + decisions->absLevel[2] = 0; + decisions->prevId[2] = state->m_stateId[start + 2]; + } + if (temp_rd_cost_b[3] < decisions->rdCost[0]) { + 
decisions->rdCost[2] = temp_rd_cost_b[3]; + decisions->absLevel[2] = pqDataA->absLevel[1]; + decisions->prevId[2] = state->m_stateId[start + 3]; + } + + // Decision 3 + if (temp_rd_cost_a[3] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_a[3]; + decisions->absLevel[3] = pqDataA->absLevel[3]; + decisions->prevId[3] = state->m_stateId[start + 3]; + } + if (temp_rd_cost_z[3] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_z[3]; + decisions->absLevel[3] = 0; + decisions->prevId[3] = state->m_stateId[start + 3]; + } + if (temp_rd_cost_b[2] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_b[2]; + decisions->absLevel[3] = pqDataA->absLevel[1]; + decisions->prevId[3] = state->m_stateId[start + 2]; + } +} + + static void checkRdCosts( const all_depquant_states * const state, const enum ScanPosType spt, @@ -579,18 +942,14 @@ static void checkRdCosts( } else { const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rdCostA += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[ - value < RICEMAX ? value : RICEMAX - 1]; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } if (pqDataA->absLevel[pqB] < 4) { rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; } else { const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rdCostB += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[ - value < RICEMAX ? value : RICEMAX - 1]; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; } if (spt == SCAN_ISCSBB) { rdCostA += state->m_sigFracBits[state_offset][1]; @@ -722,10 +1081,11 @@ static void xDecide( PQData pqData; preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); - checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); - checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + //checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + //checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + //checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + //checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); if (spt == SCAN_EOCSBB) { checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); @@ -1132,11 +1492,22 @@ int uvg_dep_quant( dep_quant_context.m_curr_state_offset = 0; dep_quant_context.m_prev_state_offset = 4; dep_quant_context.m_skip_state_offset = 8; - + const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? 
cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; + int8_t t[4] = {2, 2, 2, 2}; + __m128i pq_abs_a = _mm_set_epi32(16, 0, 16, 0); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(t)); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); + + __m128i max_rice = _mm_set1_epi32(15); + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + __m128i selected = _mm_blendv_epi8(go_rice_zero, other, cmp); + const int numCoeff = width * height; memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); From c6e6f5da339a1ad164d8d391446bbdad724bb1a8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Sat, 8 Apr 2023 18:58:40 +0300 Subject: [PATCH 210/254] [avx2] WIP check_rd_costs_avx2, almost? --- src/dep_quant.c | 113 +++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 2ff848b1..f272ad6e 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -563,10 +563,6 @@ static INLINE void checkRdCostSkipSbbZeroOut( static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) { - int32_t a[64] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}; - __m128i offsets = _mm_set_epi32(12, 8, 4, 0); - __m128i r = _mm_i32gather_epi32(a, offsets, 1); - int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; @@ -600,15 +596,15 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); __m128i t = 
_mm_slli_epi32(value, 1); offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - __m128i max_rice = _mm_set1_epi32(15); + __m128i max_rice = _mm_set1_epi32(31); value = _mm_min_epi32(value, max_rice); __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); go_rice_tab = _mm_slli_epi32(value, 5); value = _mm_add_epi32(value, go_rice_tab); - __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); } else { const int pqAs[4] = {0, 0, 3, 3}; @@ -629,7 +625,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { @@ -638,28 +634,28 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); __m128i t = _mm_slli_epi32(value, 1); offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + __m128i coeff_frac_bits = 
_mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - __m128i max_rice = _mm_set1_epi32(15); + __m128i max_rice = _mm_set1_epi32(31); value = _mm_min_epi32(value, max_rice); __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); value = _mm_add_epi32(value, go_rice_tab); - __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); } else { - const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; int64_t rd_costs[4] = {0, 0, 0, 0}; for (int i = 0; i < 4; i++) { const int state_offset = start + i; - const int pqA = pqAs[i]; + const int pqB = pqBs[i]; const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - if (pqDataA->absLevel[pqA] < 4) { - rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + if (pqDataA->absLevel[pqB] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; } else { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; } } rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256(&rd_costs[0])); @@ -672,7 +668,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); @@ -683,11 +679,11 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); @@ -706,22 +702,26 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + __m256i odd_64 = 
_mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { - rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[3], decisions->rdCost[3], decisions->rdCost[0], decisions->rdCost[0]); + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else { const int pqAs[4] = {0, 0, 3, 3}; - int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; - int64_t z_out[4] = {0, 0, 0, 0}; - _mm256_storeu_epi64(z_out, rd_cost_z); + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); for (int i = 0; i < 4; i++) { const int state_offset = start + i; if (state->m_numSigSbb[state_offset]) { @@ -729,13 +729,9 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; } else { - z_out[i] = decisions->rdCost[pqAs[i]]; + temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; } } - rd_cost_z = _mm256_loadu_epi64(z_out); - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_epi64(temp_rd_cost_a)); - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_epi64(temp_rd_cost_b)); - rd_cost_z = 
_mm256_add_epi64(rd_cost_z, _mm256_loadu_epi64(temp_rd_cost_z)); } } _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); @@ -743,25 +739,25 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else if (all_under_four) { __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); - __m128i max_rice = _mm_set1_epi32(15); + __m128i max_rice = _mm_set1_epi32(31); __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(&state->m_goRiceZero[start])); // RD cost A { __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); - __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); - __m128i selected = _mm_blendv_epi8(go_rice_smaller, other, cmp); + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); __m128i offsets = _mm_add_epi32(selected, go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); @@ -769,20 +765,20 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en // RD cost b { __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); - __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_b); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); - __m128i selected 
= _mm_blendv_epi8(go_rice_smaller, other, cmp); + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); __m128i offsets = _mm_add_epi32(selected, go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); @@ -793,7 +789,8 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); - rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_offset)); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); } _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); @@ -868,7 +865,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en decisions->prevId[0] = state->m_stateId[start + 1]; } - // Decision 1 + // Decision 2 if (temp_rd_cost_a[1] < decisions->rdCost[2]) { decisions->rdCost[2] = temp_rd_cost_a[1]; decisions->absLevel[2] = pqDataA->absLevel[0]; @@ -885,35 +882,35 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en decisions->prevId[2] = state->m_stateId[start]; } - // Decision 2 - if (temp_rd_cost_a[2] < decisions->rdCost[0]) { - decisions->rdCost[2] = temp_rd_cost_a[2]; - decisions->absLevel[2] = pqDataA->absLevel[3]; - decisions->prevId[2] = state->m_stateId[start + 2]; + // Decision 1 + if (temp_rd_cost_a[2] < decisions->rdCost[1]) { + decisions->rdCost[1] = temp_rd_cost_a[2]; + decisions->absLevel[1] = 
pqDataA->absLevel[3]; + decisions->prevId[1] = state->m_stateId[start + 2]; } - if (temp_rd_cost_z[2] < decisions->rdCost[0]) { - decisions->rdCost[2] = temp_rd_cost_z[2]; - decisions->absLevel[2] = 0; - decisions->prevId[2] = state->m_stateId[start + 2]; + if (temp_rd_cost_z[2] < decisions->rdCost[1]) { + decisions->rdCost[1] = temp_rd_cost_z[2]; + decisions->absLevel[1] = 0; + decisions->prevId[1] = state->m_stateId[start + 2]; } - if (temp_rd_cost_b[3] < decisions->rdCost[0]) { - decisions->rdCost[2] = temp_rd_cost_b[3]; - decisions->absLevel[2] = pqDataA->absLevel[1]; - decisions->prevId[2] = state->m_stateId[start + 3]; + if (temp_rd_cost_b[3] < decisions->rdCost[1]) { + decisions->rdCost[1] = temp_rd_cost_b[3]; + decisions->absLevel[1] = pqDataA->absLevel[1]; + decisions->prevId[1] = state->m_stateId[start + 3]; } // Decision 3 - if (temp_rd_cost_a[3] < decisions->rdCost[1]) { + if (temp_rd_cost_a[3] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_a[3]; decisions->absLevel[3] = pqDataA->absLevel[3]; decisions->prevId[3] = state->m_stateId[start + 3]; } - if (temp_rd_cost_z[3] < decisions->rdCost[1]) { + if (temp_rd_cost_z[3] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_z[3]; decisions->absLevel[3] = 0; decisions->prevId[3] = state->m_stateId[start + 3]; } - if (temp_rd_cost_b[2] < decisions->rdCost[1]) { + if (temp_rd_cost_b[2] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_b[2]; decisions->absLevel[3] = pqDataA->absLevel[1]; decisions->prevId[3] = state->m_stateId[start + 2]; From 8b19c468cff2057cc0e9c196916ff6f74733d1bd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Sun, 9 Apr 2023 14:05:50 +0300 Subject: [PATCH 211/254] [avx2] check_rd_costs_avx2 done --- CMakeLists.txt | 1 + src/dep_quant.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8c37bbc..6460743b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ 
target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/dep_quant.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index f272ad6e..932a12ca 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -601,7 +601,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i max_rice = _mm_set1_epi32(31); value = _mm_min_epi32(value, max_rice); __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); - go_rice_tab = _mm_slli_epi32(value, 5); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); value = _mm_add_epi32(value, go_rice_tab); __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); From 04be92a8ec90968aa760a8bf8f6df2f8893265df Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 10 Apr 2023 08:40:35 +0300 Subject: [PATCH 212/254] [avx2] simplify --- src/dep_quant.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 932a12ca..ffd352b7 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -852,68 +852,68 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en if (temp_rd_cost_a[0] < decisions->rdCost[0]) { decisions->rdCost[0] = temp_rd_cost_a[0]; decisions->absLevel[0] = pqDataA->absLevel[0]; - decisions->prevId[0] = state->m_stateId[start]; + decisions->prevId[0] = 0; } if (temp_rd_cost_z[0] < decisions->rdCost[0]) { decisions->rdCost[0] = temp_rd_cost_z[0]; decisions->absLevel[0] = 0; - decisions->prevId[0] = state->m_stateId[start]; + 
decisions->prevId[0] = 0; } if (temp_rd_cost_b[1] < decisions->rdCost[0]) { decisions->rdCost[0] = temp_rd_cost_b[1]; decisions->absLevel[0] = pqDataA->absLevel[2]; - decisions->prevId[0] = state->m_stateId[start + 1]; + decisions->prevId[0] = 1; } // Decision 2 if (temp_rd_cost_a[1] < decisions->rdCost[2]) { decisions->rdCost[2] = temp_rd_cost_a[1]; decisions->absLevel[2] = pqDataA->absLevel[0]; - decisions->prevId[2] = state->m_stateId[start + 1]; + decisions->prevId[2] =1; } if (temp_rd_cost_z[1] < decisions->rdCost[2]) { decisions->rdCost[2] = temp_rd_cost_z[1]; decisions->absLevel[2] = 0; - decisions->prevId[2] = state->m_stateId[start + 1]; + decisions->prevId[2] = 1; } if (temp_rd_cost_b[0] < decisions->rdCost[2]) { decisions->rdCost[2] = temp_rd_cost_b[0]; decisions->absLevel[2] = pqDataA->absLevel[2]; - decisions->prevId[2] = state->m_stateId[start]; + decisions->prevId[2] = 0; } // Decision 1 if (temp_rd_cost_a[2] < decisions->rdCost[1]) { decisions->rdCost[1] = temp_rd_cost_a[2]; decisions->absLevel[1] = pqDataA->absLevel[3]; - decisions->prevId[1] = state->m_stateId[start + 2]; + decisions->prevId[1] = 2; } if (temp_rd_cost_z[2] < decisions->rdCost[1]) { decisions->rdCost[1] = temp_rd_cost_z[2]; decisions->absLevel[1] = 0; - decisions->prevId[1] = state->m_stateId[start + 2]; + decisions->prevId[1] = 2; } if (temp_rd_cost_b[3] < decisions->rdCost[1]) { decisions->rdCost[1] = temp_rd_cost_b[3]; decisions->absLevel[1] = pqDataA->absLevel[1]; - decisions->prevId[1] = state->m_stateId[start + 3]; + decisions->prevId[1] = 3; } // Decision 3 if (temp_rd_cost_a[3] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_a[3]; decisions->absLevel[3] = pqDataA->absLevel[3]; - decisions->prevId[3] = state->m_stateId[start + 3]; + decisions->prevId[3] = 3; } if (temp_rd_cost_z[3] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_z[3]; decisions->absLevel[3] = 0; - decisions->prevId[3] = state->m_stateId[start + 3]; + decisions->prevId[3] = 3; } if 
(temp_rd_cost_b[2] < decisions->rdCost[3]) { decisions->rdCost[3] = temp_rd_cost_b[2]; decisions->absLevel[3] = pqDataA->absLevel[1]; - decisions->prevId[3] = state->m_stateId[start + 2]; + decisions->prevId[3] = 2; } } From 58a66c06545c4c1a9812a8ab77165b0a61570a4d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 10 Apr 2023 15:31:05 +0300 Subject: [PATCH 213/254] [avx2] WIP update_states_avx2 --- src/dep_quant.c | 609 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 531 insertions(+), 78 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index ffd352b7..78a039bb 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -157,7 +157,7 @@ typedef struct int8_t m_goRicePar[12]; int8_t m_goRiceZero[12]; int8_t m_stateId[12]; - uint32_t *m_sigFracBitsArray[12][12]; + uint32_t m_sigFracBitsArray[12][12][2]; int32_t *m_gtxFracBitsArray[21]; common_context* m_commonCtx; @@ -1240,6 +1240,510 @@ static INLINE void updateStateEOS( state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); } } +static INLINE void updateState( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id); + +static INLINE void update_states_avx2( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag) +{ + all_depquant_states* state = &ctxs->m_allStates; + + bool all_non_negative = true; + bool all_above_minus_two = true; + bool all_minus_one = true; + for (int i = 0; i < 4; ++i) { + all_non_negative &= decisions->prevId[i] >= 0; + all_above_minus_two &= decisions->prevId[i] > -2; + all_minus_one &= decisions->prevId[i] == -1; + } + int state_offset = ctxs->m_curr_state_offset; + if (all_above_minus_two) { + + bool rem_reg_all_gte_4 = true; + bool rem_reg_all_lt4 = true; + + __m128i abs_level = _mm_loadu_epi16(decisions->absLevel); + abs_level = _mm_cvtepi16_epi32(abs_level); + if (all_non_negative) { + __m128i prv_states = _mm_loadu_epi32(decisions->prevId); + __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prv_states = _mm_add_epi32(prv_states, prev_offset); + + + //__m128i num_sig_sbb = _mm_i32gather_epi32(state->m_numSigSbb, prv_states, 1); + //__m128 mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff); + //num_sig_sbb + + + int32_t prv_states_scalar[4]; + _mm_storeu_epi32(prv_states_scalar, prv_states); + int8_t sig_sbb[4] = {state->m_numSigSbb[prv_states_scalar[0]], state->m_numSigSbb[prv_states_scalar[1]], state->m_numSigSbb[prv_states_scalar[2]], state->m_numSigSbb[prv_states_scalar[3]]}; + for (int i = 0; i < 4; ++i) { + sig_sbb[i] = sig_sbb[i] || decisions->absLevel[i]; + } + 
memcpy(&state->m_numSigSbb[state_offset], sig_sbb, 4); + + __m128i ref_sbb_ctx_idx = _mm_i32gather_epi32(state->m_refSbbCtxId, prv_states, 1); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, control); + int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); + memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); + + __m128i go_rice_par = _mm_i32gather_epi32(state->m_goRicePar, prv_states, 1); + go_rice_par = _mm_shuffle_epi8(go_rice_par, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + + __m256i sbb_frac_bits = _mm256_i32gather_epi64(state->m_sbbFracBits, prv_states, 4); + _mm256_storeu_epi64(&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); + + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); + __m128i ones = _mm_set1_epi32(1); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); + + __m128i reg_bins_sub = _mm_set1_epi32(0); + __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); + __m128i secondary = _mm_blendv_epi8(abs_level, _mm_set1_epi32(3), abs_level_smaller_than_two); + + __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); + _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + + __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); + int bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_gte_4 = (bit_mask == 0xFFFF); + mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_lt4 = (bit_mask == 0xFFFF); + + for (int i = 0; i < 4; ++i) { + memcpy(state->m_absLevelsAndCtxInit[i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * 
sizeof(uint8_t)); + } + } + else if (all_minus_one) { + memset(&state->m_numSigSbb[state_offset], 1, 4); + memset(&state->m_refSbbCtxId[state_offset], -1, 4); + + const int a = (state->effWidth * state->effHeight * 28) / 16; + + __m128i rem_reg_bins = _mm_set1_epi32(a); + __m128i sub = _mm_blendv_epi8( + abs_level, + _mm_set1_epi32(3), + _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) + ); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); + _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + + __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); + int bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_gte_4 = (bit_mask == 0xFFFF); + mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_lt4 = (bit_mask == 0xFFFF); + + memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); + + } + else { + for (int i = 0; i< 4; ++i) { + const int decision_id = i; + const int state_id = state_offset + i; + if (decisions->prevId[decision_id] >= 0) { + const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 4) { + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); + } + memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); + } else { + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; + int ctxBinSampleRatio = 28; + //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); + memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); + } + rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; + rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; + } + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + uint32_t max_abs_s[4]; + _mm_storeu_epi32(max_abs_s, max_abs); + for (int i = 0; i < 4; ++i) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + levels[level_offset] = max_abs_s[i]; + } + + if (rem_reg_all_gte_4) { + const __m128i last_two_bytes = _mm_set1_epi32(0xffff); + const __m128i last_byte = _mm_set1_epi32(0xff); + const __m128i ones = _mm_set1_epi32(1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; + const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + __m128i tinit = _mm_i32gather_epi32( + state->m_absLevelsAndCtxInit[state_offset], + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), + 1); + tinit = _mm_and_epi32(tinit, last_two_bytes); + __m128i sum_abs1 = _mm_and_epi32(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); + __m128i sum_num = _mm_and_epi32(tinit, _mm_set1_epi32(7)); + + uint8_t* levels = state->m_absLevelsAndCtxInit[state_offset]; + switch (numIPos) { + case 5: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, 
_mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + sum_abs1 = _mm_add_epi32( + sum_abs1, + _mm_and_epi32(t, ones)); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + } + case 4: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + sum_abs1 = _mm_add_epi32( + sum_abs1, + _mm_and_epi32(t, ones)); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + } + case 3: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + sum_abs1 = _mm_add_epi32( + sum_abs1, + _mm_and_epi32(t, ones)); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + } + case 2: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + sum_abs1 = _mm_add_epi32( + sum_abs1, + _mm_and_epi32(t, ones)); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + } + case 1: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), + 1); + sum_abs1 = _mm_add_epi32( + sum_abs1, + _mm_and_epi32(t, ones)); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + } break; + default: + assert(0); + } + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i offsets = _mm_set_epi32(24 * 3, 24 * 2, 24 * 1, 24 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + __m128i temp = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), + _mm_set1_epi32(3)); + offsets = _mm_add_epi32(offsets, temp); + __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 4); + _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], 
sig_frac_bits); + + sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_epi32(sum_gt1_s, sum_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + } + + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + switch (numIPos) { + case 5: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 4: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 3: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 2: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 1: + { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_max_epi32( + _mm_min_epi32( + _mm_set1_epi32(31), + _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), + _mm_set1_epi32(0)); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + _mm_storeu_epi32(&state->m_goRicePar[state_offset], temp); + } + } + + else if (rem_reg_all_lt4) { + uint8_t* levels = state->m_absLevelsAndCtxInit[state_offset]; + const __m128i 
last_two_bytes = _mm_set1_epi32(0xffff); + const __m128i last_byte = _mm_set1_epi32(0xff); + const __m128i ones = _mm_set1_epi32(1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; + const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + __m128i tinit = _mm_i32gather_epi32( + state->m_absLevelsAndCtxInit[state_offset], + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), + 1); + tinit = _mm_and_epi32(tinit, last_two_bytes); + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + switch (numIPos) { + case 5: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 4: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 3: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 2: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 1: { + __m128i t = _mm_i32gather_epi32( + levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), + 1); + t = _mm_and_epi32(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_max_epi32( + _mm_min_epi32( + _mm_set1_epi32(31), + _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), + _mm_set1_epi32(0)); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + 
__m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + __m128i go_rice_zero = _mm_set_epi32(2, 2, 1, 1); + go_rice_zero = _mm_sll_epi32(go_rice_zero, temp); + go_rice_zero = _mm_shuffle_epi8(go_rice_zero, control); + int go_rice_zero_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRiceZero[state_offset], &go_rice_zero_i, 4); + } + + } + else { + for (int i = 0; i < 4; ++i) { + const int state_id = state_offset + i; + uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs1 += MIN(4 + (t & 1), t); \ + sumNum += !!t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] += currentShift; + } else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } else { + coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] += currentShift; + } else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; + } + } + } + } else { + for (int i = 0; i < 4; ++i) { + updateState( + ctxs, + numIPos, + scan_pos, + decisions, + sigCtxOffsetNext, + gtxCtxOffsetNext, + next_nb_info_ssb, + baseLevel, + extRiceFlag, + i); + } + } +} + static INLINE void updateState( context_store * ctxs, @@ -1258,7 +1762,7 @@ static INLINE void updateState( if (decisions->prevId[decision_id] > -2) { if (decisions->prevId[decision_id] >= 0) { const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) + !!decisions->absLevel[decision_id]; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; @@ -1289,30 +1793,13 @@ static INLINE void updateState( coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } - if (numIPos == 1) { - UPDATE(0); - } - else if (numIPos == 2) { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); } #undef UPDATE coeff_t sumGt1 = sumAbs1 - sumNum; @@ -1326,30 +1813,13 @@ static INLINE void updateState( coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } - if (numIPos == 1) { - UPDATE(0); - } - else if (numIPos 
== 2) { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); } #undef UPDATE if (extRiceFlag) { @@ -1367,30 +1837,13 @@ static INLINE void updateState( else { coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } - if (numIPos == 1) { - UPDATE(0); - } - else if (numIPos == 2) { - UPDATE(0); - UPDATE(1); - } - else if (numIPos == 3) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - } - else if (numIPos == 4) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - } - else if (numIPos == 5) { - UPDATE(0); - UPDATE(1); - UPDATE(2); - UPDATE(3); - UPDATE(4); + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); } #undef UPDATE if (extRiceFlag) { @@ -1456,11 +1909,11 @@ static void xDecideAndUpdate( memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(coeff_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); } else if (!zeroOut) { - - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); + update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1); 
updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3); + updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3);*/ } if (spt == SCAN_SOCSBB) { @@ -1596,7 +2049,7 @@ int uvg_dep_quant( dep_quant_context.m_allStates.m_stateId[k] = k & 3; for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { - dep_quant_context.m_allStates.m_sigFracBitsArray[k][i] = rate_estimator.m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i]; + memcpy(dep_quant_context.m_allStates.m_sigFracBitsArray[k][i], rate_estimator.m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i], sizeof(uint32_t) * 2); } } From 8f4c3cecbfc28235cb7b5d6014955db0c7840b05 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 12 Apr 2023 10:41:37 +0300 Subject: [PATCH 214/254] [avx2] update_states_avx2 working --- src/dep_quant.c | 159 +++++++++++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 78a039bb..270e0639 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -158,11 +158,14 @@ typedef struct int8_t m_goRiceZero[12]; int8_t m_stateId[12]; uint32_t m_sigFracBitsArray[12][12][2]; - int32_t *m_gtxFracBitsArray[21]; + int32_t m_gtxFracBitsArray[21][6]; common_context* m_commonCtx; unsigned effWidth; unsigned effHeight; + + bool all_gte_four; + bool all_lt_four; } all_depquant_states; typedef struct @@ -577,14 +580,8 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); - bool all_over_or_four = true; - bool all_under_four = true; - for (int i = 0; i < 4; i++) { - all_over_or_four &= state->m_remRegBins[start + i] >= 4; - 
all_under_four &= state->m_remRegBins[start + i] < 4; - } - if (all_over_or_four) { + if (state->all_gte_four) { if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); @@ -737,7 +734,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); - } else if (all_under_four) { + } else if (state->all_lt_four) { __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); __m128i max_rice = _mm_set1_epi32(31); __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(&state->m_goRiceZero[start])); @@ -1274,6 +1271,8 @@ static INLINE void update_states_avx2( all_minus_one &= decisions->prevId[i] == -1; } int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_loadu_epi64(decisions->rdCost); + _mm256_storeu_epi64(&ctxs->m_allStates.m_rdCost[state_offset], rd_cost); if (all_above_minus_two) { bool rem_reg_all_gte_4 = true; @@ -1312,7 +1311,7 @@ static INLINE void update_states_avx2( memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - __m256i sbb_frac_bits = _mm256_i32gather_epi64(state->m_sbbFracBits, prv_states, 4); + __m256i sbb_frac_bits = _mm256_i32gather_epi64(state->m_sbbFracBits, prv_states, 8); _mm256_storeu_epi64(&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); @@ -1321,7 +1320,7 @@ static INLINE void update_states_avx2( __m128i reg_bins_sub = _mm_set1_epi32(0); __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); - __m128i secondary = _mm_blendv_epi8(abs_level, _mm_set1_epi32(3), abs_level_smaller_than_two); + __m128i secondary 
= _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); @@ -1336,7 +1335,7 @@ static INLINE void update_states_avx2( rem_reg_all_lt4 = (bit_mask == 0xFFFF); for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); + memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); } } else if (all_minus_one) { @@ -1347,8 +1346,8 @@ static INLINE void update_states_avx2( __m128i rem_reg_bins = _mm_set1_epi32(a); __m128i sub = _mm_blendv_epi8( - abs_level, _mm_set1_epi32(3), + abs_level, _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) ); rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); @@ -1400,18 +1399,20 @@ static INLINE void update_states_avx2( uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; levels[level_offset] = max_abs_s[i]; } - + state->all_gte_four = rem_reg_all_gte_4; + state->all_lt_four = rem_reg_all_lt4; if (rem_reg_all_gte_4) { - const __m128i last_two_bytes = _mm_set1_epi32(0xffff); - const __m128i last_byte = _mm_set1_epi32(0xff); + const __m128i first_two_bytes = _mm_set1_epi32(0xffff); + const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); __m128i tinit = _mm_i32gather_epi32( state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), - 1); - tinit = _mm_and_epi32(tinit, last_two_bytes); + _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + 2); + tinit = 
_mm_and_epi32(tinit, first_two_bytes); __m128i sum_abs1 = _mm_and_epi32(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_epi32(tinit, _mm_set1_epi32(7)); @@ -1423,12 +1424,18 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); + t = _mm_and_epi32(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + t + ); sum_abs1 = _mm_add_epi32( sum_abs1, - _mm_and_epi32(t, ones)); + min_arg + ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); } case 4: { @@ -1436,12 +1443,18 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); + t = _mm_and_epi32(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + t + ); sum_abs1 = _mm_add_epi32( sum_abs1, - _mm_and_epi32(t, ones)); + min_arg + ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); } case 3: { @@ -1449,12 +1462,18 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); + t = _mm_and_epi32(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + t + ); sum_abs1 = _mm_add_epi32( sum_abs1, - _mm_and_epi32(t, ones)); + min_arg + ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); } case 2: { @@ -1462,39 +1481,52 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); + t = _mm_and_epi32(t, first_byte); + __m128i min_arg = _mm_min_epi32( + 
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + t + ); sum_abs1 = _mm_add_epi32( sum_abs1, - _mm_and_epi32(t, ones)); + min_arg + ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); } case 1: { __m128i t = _mm_i32gather_epi32( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); + t = _mm_and_epi32(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + t + ); sum_abs1 = _mm_add_epi32( sum_abs1, - _mm_and_epi32(t, ones)); + min_arg + ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, last_byte), ones)); + _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); } break; default: assert(0); } __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); - __m128i offsets = _mm_set_epi32(24 * 3, 24 * 2, 24 * 1, 24 * 0); + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); __m128i temp = _mm_min_epi32( _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), _mm_set1_epi32(3)); offsets = _mm_add_epi32(offsets, temp); - __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 4); + __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 8); _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); uint32_t sum_gt1_s[4]; _mm_storeu_epi32(sum_gt1_s, sum_gt1); for (int i = 0; i < 4; ++i) { @@ -1509,7 +1541,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 4: @@ -1518,7 +1550,7 
@@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 3: @@ -1527,7 +1559,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 2: @@ -1536,7 +1568,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 1: @@ -1545,7 +1577,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); } break; default: @@ -1560,7 +1592,10 @@ static INLINE void update_states_avx2( _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), _mm_set1_epi32(0)); __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); - _mm_storeu_epi32(&state->m_goRicePar[state_offset], temp); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); } } @@ -1571,10 +1606,11 @@ static INLINE void update_states_avx2( const __m128i ones = _mm_set1_epi32(1); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); __m128i tinit = _mm_i32gather_epi32( 
state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), - 1); + _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + 2); tinit = _mm_and_epi32(tinit, last_two_bytes); __m128i sum_abs = _mm_srli_epi32(tinit, 8); switch (numIPos) { @@ -1624,22 +1660,19 @@ static INLINE void update_states_avx2( if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { - __m128i sum_all = _mm_max_epi32( - _mm_min_epi32( - _mm_set1_epi32(31), - _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), - _mm_set1_epi32(0)); + __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i go_rice_par = _mm_shuffle_epi8(temp, control); int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - __m128i go_rice_zero = _mm_set_epi32(2, 2, 1, 1); - go_rice_zero = _mm_sll_epi32(go_rice_zero, temp); - go_rice_zero = _mm_shuffle_epi8(go_rice_zero, control); - int go_rice_zero_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRiceZero[state_offset], &go_rice_zero_i, 4); + + for (int i = 0; i < 4; ++i) { + state->m_goRiceZero[state_offset + i] = (i < 2 ? 
1 : 2) << state->m_goRicePar[state_offset + i]; + + } + } } @@ -1729,6 +1762,8 @@ static INLINE void update_states_avx2( } } else { for (int i = 0; i < 4; ++i) { + state->all_gte_four = true; + state->all_lt_four = true; updateState( ctxs, numIPos, @@ -1758,7 +1793,7 @@ static INLINE void updateState( int decision_id) { all_depquant_states* state = &ctxs->m_allStates; int state_id = ctxs->m_curr_state_offset + decision_id; - state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + // state->m_rdCost[state_id] = decisions->rdCost[decision_id]; if (decisions->prevId[decision_id] > -2) { if (decisions->prevId[decision_id] >= 0) { const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; @@ -1784,7 +1819,8 @@ static INLINE void updateState( decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); } - + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); @@ -1860,6 +1896,10 @@ static INLINE void updateState( state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 1 : 2) << state->m_goRicePar[state_id]; } } + else { + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; + } } static bool same[13]; @@ -1946,18 +1986,7 @@ int uvg_dep_quant( const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? 
cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; - - int8_t t[4] = {2, 2, 2, 2}; - __m128i pq_abs_a = _mm_set_epi32(16, 0, 16, 0); - __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(t)); - __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); - - __m128i max_rice = _mm_set1_epi32(15); - __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); - - __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); - __m128i selected = _mm_blendv_epi8(go_rice_zero, other, cmp); - + const int numCoeff = width * height; memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); @@ -2055,9 +2084,11 @@ int uvg_dep_quant( dep_quant_context.m_allStates.effHeight = effectHeight; dep_quant_context.m_allStates.effWidth = effectWidth; + dep_quant_context.m_allStates.all_gte_four = true; + dep_quant_context.m_allStates.all_lt_four = false; dep_quant_context.m_allStates.m_commonCtx = &dep_quant_context.m_common_context; for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { - dep_quant_context.m_allStates.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + memcpy(dep_quant_context.m_allStates.m_gtxFracBitsArray[i], rate_estimator.m_gtxFracBits[i], sizeof(int32_t) * 6); } depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); From cd6110cfac238fe425087b47618a63dd75949e97 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 12 Apr 2023 15:02:06 +0300 Subject: [PATCH 215/254] [depquant] Pre calculate things sig_ctx_offset gtx_ctx_offset cg_pos pos_y pos_x next_sbb_right next_sbb_below --- src/dep_quant.c | 122 ++++++++++++++++++++++++------------------------ src/dep_quant.h | 11 +++++ src/encoder.h | 1 + 3 files changed, 74 insertions(+), 60 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 270e0639..7de8828f 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -184,6 +184,7 @@ typedef struct int uvg_init_nb_info(encoder_control_t * encoder) { memset(encoder->m_scanId2NbInfoSbbArray, 0, 
sizeof(encoder->m_scanId2NbInfoSbbArray)); memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); + memset(encoder->scan_info, 0, sizeof(encoder->scan_info)); for (int hd = 0; hd <= 6; hd++) { @@ -206,6 +207,7 @@ int uvg_init_nb_info(encoder_control_t * encoder) { const uint32_t blkWidthIdx = hd; const uint32_t blkHeightIdx = vd; const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, 0, hd, vd); NbInfoSbb** sId2NbSbb = &encoder->m_scanId2NbInfoSbbArray[hd][vd]; NbInfoOut** sId2NbOut = &encoder->m_scanId2NbInfoOutArray[hd][vd]; // consider only non-zero-out region @@ -221,11 +223,18 @@ int uvg_init_nb_info(encoder_control_t * encoder) { if (*sId2NbOut == NULL) { return 0; } + encoder->scan_info[hd][vd] = MALLOC(struct dep_quant_scan_info, totalValues); + if (encoder->scan_info[hd][vd] == NULL) { + return 0; + } + for (uint32_t scanId = 0; scanId < totalValues; scanId++) { raster2id[scanId2RP[scanId]] = scanId; } + const uint32_t height_in_sbb = MAX(blockHeight >> 2, 1); + const uint32_t width_in_sbb = MAX(blockWidth >> 2, 1); for (unsigned scanId = 0; scanId < totalValues; scanId++) { @@ -309,6 +318,28 @@ int uvg_init_nb_info(encoder_control_t * encoder) { } } } + uint32_t cg_pos = cg_scan[scanId >> 4]; + + uint32_t blkpos_next = scanId2RP[scanId ? scanId - 1 : 0]; + uint32_t pos_y_next = blkpos_next >> hd; + uint32_t pos_x_next = blkpos_next - (pos_y_next << hd); + uint32_t cg_blockpos_next = scanId ? cg_scan[(scanId - 1) >> 4] : 0; + uint32_t cg_pos_y_next = cg_blockpos_next / width_in_sbb; + uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * width_in_sbb); + uint32_t diag = pos_y_next + pos_x_next; + + + uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); + uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? 
cg_blockpos_next + width_in_sbb : 0); + encoder->scan_info[hd][vd][scanId].pos_x = pos_x; + encoder->scan_info[hd][vd][scanId].pos_y = pos_y; + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[0] = (diag < 2 ? 8 : diag < 5 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[1] = (diag < 2 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[0] = (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[1] = (diag < 1 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].cg_pos = cg_pos; + encoder->scan_info[hd][vd][scanId].next_sbb_right = nextSbbRight; + encoder->scan_info[hd][vd][scanId].next_sbb_below = nextSbbBelow; } // make it relative @@ -338,6 +369,7 @@ void uvg_dealloc_nb_info(encoder_control_t* encoder) { } if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoOutArray[hd][vd]); if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoSbbArray[hd][vd]); + if(encoder->scan_info[hd][vd]) FREE_POINTER(encoder->scan_info[hd][vd]); } } } @@ -1904,24 +1936,19 @@ static INLINE void updateState( static bool same[13]; static void xDecideAndUpdate( - rate_estimator* re, - context_store* ctxs, - const coeff_t absCoeff, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t pos_x, - const uint32_t pos_y, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, - const NbInfoSbb next_nb_info_ssb, - bool zeroOut, - coeff_t quantCoeff, - int effWidth, - int effHeight) + rate_estimator* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + int effWidth, + int effHeight, + bool is_chroma) { 
Decision* decisions = &ctxs->m_trellis[scan_pos]; SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); @@ -1936,20 +1963,20 @@ static void xDecideAndUpdate( spt = SCAN_EOCSBB; } - xDecide(&ctxs->m_allStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[pos_x] + re->m_lastBitsY[pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + xDecide(&ctxs->m_allStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); if (scan_pos) { if (!(scan_pos & 15)) { SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); - updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 0); - updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 1); - updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 2); - updateStateEOS(ctxs, scan_pos, cg_pos, sigCtxOffsetNext, gtxCtxOffsetNext, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, decisions, 3); + updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); + updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, 
scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int)); memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(coeff_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); } else if (!zeroOut) { - update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false); + update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1); updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); @@ -2105,26 +2132,11 @@ int uvg_dep_quant( const uint32_t height_in_sbb = MAX(height >> 2, 1); const uint32_t width_in_sbb = MAX(width >> 2, 1); + //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { uint32_t blkpos = scan[scanIdx]; - uint32_t pos_y = blkpos >> log2_tr_width; - uint32_t pos_x = blkpos - (pos_y << log2_tr_width); - uint32_t cg_pos = cg_scan[scanIdx >> 4]; - - uint32_t blkpos_next = scan[scanIdx ? scanIdx - 1 : 0]; - uint32_t pos_y_next = blkpos_next >> log2_tr_width; - uint32_t pos_x_next = blkpos_next - (pos_y_next << log2_tr_width); - uint32_t cg_blockpos_next = scanIdx ? 
cg_scan[(scanIdx -1) >> 4] : 0; - uint32_t cg_pos_y_next = cg_blockpos_next / width_in_sbb; - uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * width_in_sbb); - uint32_t diag = pos_y_next + pos_x_next; - - uint32_t sig_ctx_offset = compID == COLOR_Y ? (diag < 2 ? 8 : diag < 5 ? 4 : 0) : (diag < 2 ? 4 : 0); - uint32_t gtx_ctx_offset = compID == COLOR_Y ? (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1) : (diag < 1 ? 6 : 1); - - uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); - uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? cg_blockpos_next + width_in_sbb : 0); + struct dep_quant_scan_info* scan_info = &encoder->scan_info[log2_tr_width][log2_tr_height][scanIdx]; context_store* ctxs = &dep_quant_context; if (enableScalingLists) { @@ -2133,44 +2145,34 @@ int uvg_dep_quant( xDecideAndUpdate( &rate_estimator, ctxs, + scan_info, abs(srcCoeff[blkpos]), scanIdx, - cg_pos, - pos_x, - pos_y, - sig_ctx_offset, - gtx_ctx_offset, width_in_sbb, height_in_sbb, - nextSbbRight, - nextSbbBelow, encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], - (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), q_coeff[blkpos], width, - height - ); //tu.cu->slice->getReverseLastSigCoeffFlag()); + height, + compID != 0 + ); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { xDecideAndUpdate( &rate_estimator, ctxs, + scan_info, abs(srcCoeff[blkpos]), scanIdx, - cg_pos, - pos_x, - pos_y, - sig_ctx_offset, - gtx_ctx_offset, width_in_sbb, height_in_sbb, - nextSbbRight, - nextSbbBelow, encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? 
scanIdx - 1 : 0], - (zeroOut && (pos_x >= effWidth || pos_y >= effHeight)), + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), default_quant_coeff, width, - height); //tu.cu->slice->getReverseLastSigCoeffFlag()); + height, + compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); } if(0){ printf("%d\n", scanIdx); diff --git a/src/dep_quant.h b/src/dep_quant.h index c3fb69a4..a8483e40 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -38,6 +38,17 @@ typedef struct encoder_control_t encoder_control_t; +struct dep_quant_scan_info +{ + uint8_t sig_ctx_offset[2]; + uint8_t gtx_ctx_offset[2]; + uint16_t cg_pos; + uint16_t pos_y; + uint16_t pos_x; + uint8_t next_sbb_right; + uint8_t next_sbb_below; +}; + typedef struct { uint8_t num; diff --git a/src/encoder.h b/src/encoder.h index 81b091b3..05750292 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -101,6 +101,7 @@ typedef struct encoder_control_t NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1]; NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1]; + struct dep_quant_scan_info* scan_info[7 + 1][7 + 1]; //spec: references to variables defined in Rec. ITU-T H.265 (04/2013) int8_t tiles_enable; /*! 
Date: Wed, 12 Apr 2023 15:36:45 +0300 Subject: [PATCH 216/254] [avx2] Do decision cost comparison with avx2 --- src/dep_quant.c | 102 +++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 70 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 7de8828f..b6158f68 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -97,8 +97,8 @@ typedef struct typedef struct { int64_t rdCost[8]; - coeff_t absLevel[8]; - int prevId[8]; + int32_t absLevel[8]; + int32_t prevId[8]; } Decision; @@ -877,73 +877,36 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); } - // Decision 0 - if (temp_rd_cost_a[0] < decisions->rdCost[0]) { - decisions->rdCost[0] = temp_rd_cost_a[0]; - decisions->absLevel[0] = pqDataA->absLevel[0]; - decisions->prevId[0] = 0; - } - if (temp_rd_cost_z[0] < decisions->rdCost[0]) { - decisions->rdCost[0] = temp_rd_cost_z[0]; - decisions->absLevel[0] = 0; - decisions->prevId[0] = 0; - } - if (temp_rd_cost_b[1] < decisions->rdCost[0]) { - decisions->rdCost[0] = temp_rd_cost_b[1]; - decisions->absLevel[0] = pqDataA->absLevel[2]; - decisions->prevId[0] = 1; - } + rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); + rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); + rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); + __m256i rd_cost_decision = _mm256_loadu_epi64(decisions->rdCost); - // Decision 2 - if (temp_rd_cost_a[1] < decisions->rdCost[2]) { - decisions->rdCost[2] = temp_rd_cost_a[1]; - decisions->absLevel[2] = pqDataA->absLevel[0]; - decisions->prevId[2] =1; - } - if (temp_rd_cost_z[1] < decisions->rdCost[2]) { - decisions->rdCost[2] = temp_rd_cost_z[1]; - decisions->absLevel[2] = 0; - decisions->prevId[2] = 1; - } - if (temp_rd_cost_b[0] < decisions->rdCost[2]) { - decisions->rdCost[2] = temp_rd_cost_b[0]; - decisions->absLevel[2] = pqDataA->absLevel[2]; - decisions->prevId[2] = 0; - 
} + __m256i decision_abs_coeff = _mm256_loadu_epi32(decisions->absLevel); + __m256i decision_prev_state = _mm256_loadu_epi32(decisions->prevId); + __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); + __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); - // Decision 1 - if (temp_rd_cost_a[2] < decisions->rdCost[1]) { - decisions->rdCost[1] = temp_rd_cost_a[2]; - decisions->absLevel[1] = pqDataA->absLevel[3]; - decisions->prevId[1] = 2; - } - if (temp_rd_cost_z[2] < decisions->rdCost[1]) { - decisions->rdCost[1] = temp_rd_cost_z[2]; - decisions->absLevel[1] = 0; - decisions->prevId[1] = 2; - } - if (temp_rd_cost_b[3] < decisions->rdCost[1]) { - decisions->rdCost[1] = temp_rd_cost_b[3]; - decisions->absLevel[1] = pqDataA->absLevel[1]; - decisions->prevId[1] = 3; - } + __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); + __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); + __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); - // Decision 3 - if (temp_rd_cost_a[3] < decisions->rdCost[3]) { - decisions->rdCost[3] = temp_rd_cost_a[3]; - decisions->absLevel[3] = pqDataA->absLevel[3]; - decisions->prevId[3] = 3; - } - if (temp_rd_cost_z[3] < decisions->rdCost[3]) { - decisions->rdCost[3] = temp_rd_cost_z[3]; - decisions->absLevel[3] = 0; - decisions->prevId[3] = 3; - } - if (temp_rd_cost_b[2] < decisions->rdCost[3]) { - decisions->rdCost[3] = temp_rd_cost_b[2]; - decisions->absLevel[3] = pqDataA->absLevel[1]; - decisions->prevId[3] = 2; - } + __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); + __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); + __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); + + __m256i z_vs_decision = 
_mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); + + __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); + + _mm256_storeu_epi64(decisions->rdCost, final_rd_cost); + final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + _mm256_storeu2_m128i(decisions->prevId, decisions->absLevel, final_data); } @@ -1310,8 +1273,7 @@ static INLINE void update_states_avx2( bool rem_reg_all_gte_4 = true; bool rem_reg_all_lt4 = true; - __m128i abs_level = _mm_loadu_epi16(decisions->absLevel); - abs_level = _mm_cvtepi16_epi32(abs_level); + __m128i abs_level = _mm_loadu_epi32(decisions->absLevel); if (all_non_negative) { __m128i prv_states = _mm_loadu_epi32(decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); @@ -1972,8 +1934,8 @@ static void xDecideAndUpdate( updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); - memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int)); - memcpy(decisions->absLevel + 4, 
decisions->absLevel, 4 * sizeof(coeff_t)); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); } else if (!zeroOut) { update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); From 9f69713c242003fd0ffbfe52ece9b3598f876f8a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 14 Apr 2023 08:25:33 +0300 Subject: [PATCH 217/254] [depquant] remove an unnecessary memcpy --- src/dep_quant.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index b6158f68..c98ab408 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1150,7 +1150,7 @@ static INLINE void update_common_context( ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; - uint16_t templateCtxInit[16]; + uint16_t *templateCtxInit = ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state] + 8; const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; const uint8_t* absLevels = levels + scanBeg; @@ -1179,7 +1179,6 @@ static INLINE void update_common_context( } } memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); - memcpy(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state] + 8, templateCtxInit, 16 * sizeof(uint16_t)); } From c56350b8d6e3feebcd1e9218e5d42e37d5f45c7f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 14 Apr 2023 09:55:09 +0300 Subject: [PATCH 218/254] [avx2] and last --- src/dep_quant.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index c98ab408..d01b9da6 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1385,7 +1385,7 @@ 
static INLINE void update_states_avx2( } } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); uint32_t max_abs_s[4]; _mm_storeu_epi32(max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { @@ -1527,6 +1527,7 @@ static INLINE void update_states_avx2( } __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); switch (numIPos) { case 5: { @@ -1534,8 +1535,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); - t = _mm_and_epi32(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); + sum_abs = _mm_add_epi32(t, sum_abs); } case 4: { @@ -1543,8 +1543,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); - t = _mm_and_epi32(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); + sum_abs = _mm_add_epi32(t, sum_abs); } case 3: { @@ -1552,8 +1551,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); - t = _mm_and_epi32(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); + sum_abs = _mm_add_epi32(t, sum_abs); } case 2: { @@ -1561,8 +1559,7 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); - t = _mm_and_epi32(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); + sum_abs = _mm_add_epi32(t, sum_abs); } case 1: { @@ -1570,12 +1567,12 @@ static INLINE void update_states_avx2( levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); - t = _mm_and_epi32(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); + sum_abs = _mm_add_epi32(t, sum_abs); } break; default: assert(0); } + sum_abs = _mm_and_epi32(sum_abs, first_byte); if (extRiceFlag) { assert(0 && "Not 
implemented for avx2"); } else { @@ -1815,7 +1812,7 @@ static INLINE void updateState( state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4; uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); - levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); if (state->m_remRegBins[state_id] >= 4) { coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; From 9e27b4056a42b6dbc18ab29f850d48c681450288 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 17 Apr 2023 13:52:42 +0300 Subject: [PATCH 219/254] [avx2] WIP update_state_eos_avx2 --- src/dep_quant.c | 353 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 337 insertions(+), 16 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index d01b9da6..6ea82fef 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -107,9 +107,11 @@ typedef struct const NbInfoOut* m_nbInfo; uint32_t m_sbbFlagBits[2][2]; SbbCtx m_allSbbCtx[8]; - SbbCtx* m_currSbbCtx; - SbbCtx* m_prevSbbCtx; - uint8_t m_memory[8 * (TR_MAX_WIDTH * TR_MAX_WIDTH + 1024)]; + int m_curr_sbb_ctx_offset; + int m_prev_sbb_ctx_offset; + uint8_t sbb_memory[8 * 1024]; + uint8_t level_memory[8* TR_MAX_WIDTH * TR_MAX_WIDTH]; + int num_coeff; } common_context; @@ -447,14 +449,15 @@ static void reset_common_context(common_context* ctx, const rate_estimator * rat { //memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); - const int chunkSize = numSbb + num_coeff; - uint8_t* nextMem = ctx->m_memory; - for (int k = 0; k < 8; k++, nextMem += chunkSize) { - ctx->m_allSbbCtx[k].sbbFlags = nextMem; - ctx->m_allSbbCtx[k].levels = nextMem + numSbb; + uint8_t* next_sbb_memory = ctx->sbb_memory; + uint8_t* next_level_memory = ctx->level_memory; + for (int k = 0; 
k < 8; k++, next_sbb_memory += numSbb, next_level_memory += num_coeff) { + ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory; + ctx->m_allSbbCtx[k].levels = next_level_memory; } - ctx->m_currSbbCtx = &ctx->m_allSbbCtx[0]; - ctx->m_prevSbbCtx = &ctx->m_allSbbCtx[4]; + ctx->m_curr_sbb_ctx_offset = 0; + ctx->m_prev_sbb_ctx_offset = 4; + ctx->num_coeff = num_coeff; } static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data_t * const ctx, color_t color) @@ -1121,12 +1124,12 @@ static INLINE void update_common_context( const int curr_state) { const uint32_t numSbb = width_in_sbb * height_in_sbb; - uint8_t* sbbFlags = cc->m_currSbbCtx[curr_state & 3].sbbFlags; - uint8_t* levels = cc->m_currSbbCtx[curr_state & 3].levels; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state & 3)].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state & 3)].levels; size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); if (prev_state != -1 && ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) { - memcpy(sbbFlags, cc->m_prevSbbCtx[ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_prevSbbCtx[ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); + memcpy(sbbFlags, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t)); + memcpy(levels + scan_pos, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); } else { memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); @@ -1181,6 +1184,323 @@ static INLINE void update_common_context( memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); } +static INLINE void updateStateEOS( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, 
+ const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision* decisions, + int decision_id); + +static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, const uint32_t height_in_sbb, + const uint32_t next_sbb_right, const uint32_t next_sbb_below, + const Decision* decisions) +{ + all_depquant_states* state = &ctxs->m_allStates; + bool all_above_minus_two = true; + bool all_between_zero_and_three = true; + bool all_above_four = true; + + + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_loadu_epi64(decisions->rdCost); + _mm256_storeu_epi64(&ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + for (int i = 0; i < 4; ++i) { + all_above_minus_two &= decisions->prevId[i] > -2; + all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; + all_above_four &= decisions->prevId[i] >= 4; + } + if (all_above_minus_two) { + bool all_have_previous_state = true; + __m128i prev_state; + __m128i abs_level = _mm_loadu_epi32(decisions->absLevel); + if (all_above_four) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state = _mm_add_epi32( + prev_state, + _mm_sub_epi32( + _mm_loadu_epi32(decisions->prevId), + _mm_set1_epi32(4) + ) + ); + memset(&state->m_numSigSbb[state_offset], 0, 4); + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); + } + } else if (all_between_zero_and_three) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state = _mm_add_epi32( + prev_state, + _mm_sub_epi32( + _mm_loadu_epi32(decisions->prevId), + _mm_set1_epi32(4) + ) + ); + __m128i num_sig_sbb = _mm_i32gather_epi32(&state->m_numSigSbb[state_offset], prev_state, 1); + num_sig_sbb = _mm_and_epi32(num_sig_sbb, _mm_set1_epi32(0xff)); + 
num_sig_sbb = _mm_and_epi32( + num_sig_sbb, + _mm_max_epi32(abs_level, _mm_set1_epi32(1)) + ); + + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); + int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); + memcpy(&state->m_refSbbCtxId[state_offset], &num_sig_sbb_s, 4); + + int32_t prev_state_scalar[4]; + _mm_storeu_epi32(prev_state_scalar, prev_state); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + } + } else { + int prev_state_s[4] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) { + const int decision_id = i; + const int curr_state_offset = state_offset + i; + if (decisions->prevId[decision_id] >= 4) { + prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + } else if (decisions->prevId[decision_id] >= 0) { + prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; + memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); + } else { + state->m_numSigSbb[curr_state_offset] = 1; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + all_have_previous_state = false; + } + } + prev_state = _mm_loadu_epi32(prev_state_s); + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); + uint32_t max_abs_s[4]; + _mm_storeu_epi32(max_abs_s, max_abs); + for (int i = 0; i < 4; ++i) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + levels[level_offset] = max_abs_s[i]; + } + + // Update 
common context + __m128i last; + { + const uint32_t numSbb = width_in_sbb * height_in_sbb; + common_context* cc = &ctxs->m_common_context; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + int previous_state_array[4]; + _mm_storeu_epi32(previous_state_array, prev_state); + for (int curr_state = 0; curr_state < 4; ++curr_state) { + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; + memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); + memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); + } else { + memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); + memset(levels + scan_pos, 0, setCpSize); + } + sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); + } + + __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); + __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); + __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); + __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32(&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); + + __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); + __m128i sbb_below = next_sbb_right ? 
_mm_i32gather_epi32(&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + + __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); + sig_sbb = _mm_max_epi32(sig_sbb, _mm_set1_epi32(1)); + __m256i sbb_frac_bits = _mm256_i32gather_epi64(cc->m_sbbFlagBits, sig_sbb, 8); + _mm256_storeu_epi64(state->m_sbbFracBits[state_offset], sbb_frac_bits); + + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(&state->m_goRicePar[state_offset], 0, 4); + + uint8_t states[4] = {0, 1, 2, 3}; + memcpy(&state->m_refSbbCtxId[state_offset], states, 4); + if (all_have_previous_state) { + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + } else { + const int temp = (state->effWidth * state->effHeight * 28) / 16; + for (int i = 0; i < 4; ++i) { + if (previous_state_array[i] != -1) { + state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; + } else { + state->m_remRegBins[i + state_offset] = temp; + } + } + } + + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; + + __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); + __m128i first_byte = _mm_set1_epi32(0xff); + __m128i ones = _mm_set1_epi32(1); + __m128i fours = _mm_set1_epi32(4); + __m256i all[4]; + uint64_t temp[4]; + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num == 0) { + temp[id % 4] = 0; + if (id % 4 == 3) { + all[0] = _mm256_loadu_epi64(temp); + } + continue; + } + __m128i sum_abs = _mm_set1_epi32(0); + __m128i sum_abs_1 = _mm_set1_epi32(0); + __m128i sum_num = _mm_set1_epi32(0); + switch (nbOut->num) { + case 5: + { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); + __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + t = 
_mm_and_epi32(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_epi32(t, ones) + ) + ); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 4: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); + __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + t = _mm_and_epi32(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_epi32(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 3: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); + __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + t = _mm_and_epi32(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_epi32(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 2: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); + __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + t = _mm_and_epi32(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_epi32(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 1: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); + __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + t = _mm_and_epi32(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_epi32(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + 
} + break; + default: + assert(0); + } + sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); + sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); + __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); + _mm_add_epi32(template_ctx_init, sum_abs_1); + __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); + temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); + if (id %4 == 3) { + all[0] = _mm256_loadu_epi64(temp); + last = template_ctx_init; + } + } + + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); + } + } + + __m128i sum_num = _mm_and_epi32(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_epi32( + _mm_srli_epi32(last, 3), + _mm_set1_epi32(31)); + + __m128i sum_abs_min = _mm_min_epi32( + _mm_set1_epi32(3), + _mm_srli_epi32( + _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), + 1)); + + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 8); + _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_epi32(sum_gt1_s, min_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + } + } + else { + for (int i = 0; i < 4; i++) { + updateStateEOS( + ctxs, + scan_pos, + cg_pos, + sigCtxOffsetNext, + gtxCtxOffsetNext, + width_in_sbb, + height_in_sbb, + next_sbb_right, + next_sbb_below, + decisions, + i); + } + } +} + static INLINE void updateStateEOS( context_store* ctxs, @@ -1215,7 +1535,7 @@ static INLINE 
void updateStateEOS( memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); - *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + *temp = (uint8_t)MIN(32, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); @@ -1925,7 +2245,7 @@ static void xDecideAndUpdate( if (scan_pos) { if (!(scan_pos & 15)) { - SWAP(ctxs->m_common_context.m_currSbbCtx, ctxs->m_common_context.m_prevSbbCtx, SbbCtx*); + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); @@ -1933,6 +2253,7 @@ static void xDecideAndUpdate( memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + printf("\n"); } else if (!zeroOut) { update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, 
decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); From 00f838306f416d099b8527607331fc880aa8ae37 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 17 Apr 2023 14:18:57 +0300 Subject: [PATCH 220/254] [depquant] Initialize quant_block only when necessary --- src/dep_quant.c | 44 +++++++++++++++++++------------------------- src/dep_quant.h | 16 ++++++++++++++++ src/encoderstate.h | 2 ++ src/intra.c | 1 + src/search_intra.c | 5 +++-- 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 6ea82fef..96002664 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -65,20 +65,7 @@ static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; -typedef struct -{ - int m_QShift; - int64_t m_QAdd; - int64_t m_QScale; - int64_t m_maxQIdx; - int64_t m_thresLast; - int64_t m_thresSSbb; - // distortion normalization - int m_DistShift; - int64_t m_DistAdd; - int64_t m_DistStepAdd; - int64_t m_DistOrgFact; -} quant_block; + typedef struct @@ -172,13 +159,13 @@ typedef struct typedef struct { - common_context m_common_context; + common_context m_common_context; all_depquant_states m_allStates; int m_curr_state_offset; int m_prev_state_offset; int m_skip_state_offset; depquant_state m_startState; - quant_block m_quant; + quant_block* m_quant; Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; } context_store; @@ -443,6 +430,7 @@ static void init_quant_block( qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); qp->m_DistOrgFact = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + 1)) + .5); + qp->needs_init = false; } static void reset_common_context(common_context* ctx, const rate_estimator * rate_estimator, int numSbb, int num_coeff) @@ -2241,7 +2229,7 @@ static void xDecideAndUpdate( 
spt = SCAN_EOCSBB; } - xDecide(&ctxs->m_allStates, &ctxs->m_startState, &ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); if (scan_pos) { if (!(scan_pos & 15)) { @@ -2313,11 +2301,17 @@ int uvg_dep_quant( const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; const int32_t *q_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; - const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (is_ts ? 0 : transform_shift ); - const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 
171 : 85) << (q_bits - 9); - - init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + + if (compID != COLOR_Y) { + dep_quant_context.m_quant = (quant_block*)& state->quant_blocks[2]; + } else if (cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP) { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[1]; + } else { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[0]; + } + if (dep_quant_context.m_quant->needs_init) { + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + } //===== scaling matrix ==== //const int qpDQ = cQP.Qp + 1; @@ -2345,8 +2339,8 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } - const int32_t default_quant_coeff = dep_quant_context.m_quant.m_QScale; - const int32_t thres = dep_quant_context.m_quant.m_thresLast; + const int32_t default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; for (; firstTestPos >= 0; firstTestPos--) { coeff_t thresTmp = (enableScalingLists) ? 
(thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { @@ -2419,7 +2413,7 @@ int uvg_dep_quant( context_store* ctxs = &dep_quant_context; if (enableScalingLists) { - init_quant_block(state, &dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); xDecideAndUpdate( &rate_estimator, diff --git a/src/dep_quant.h b/src/dep_quant.h index a8483e40..1f059119 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -49,6 +49,22 @@ struct dep_quant_scan_info uint8_t next_sbb_below; }; +typedef struct +{ + int m_QShift; + int64_t m_QAdd; + int64_t m_QScale; + int64_t m_maxQIdx; + int64_t m_thresLast; + int64_t m_thresSSbb; + // distortion normalization + int m_DistShift; + int64_t m_DistAdd; + int64_t m_DistStepAdd; + int64_t m_DistOrgFact; + bool needs_init; +} quant_block; + typedef struct { uint8_t num; diff --git a/src/encoderstate.h b/src/encoderstate.h index 7afa78ab..f9d7d0a8 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -366,6 +366,8 @@ typedef struct encoder_state_t { // luma mode in the lfnst functions, instead store the current // collocated luma mode in the state. 
int8_t collocated_luma_mode; + + quant_block quant_blocks[3]; // luma, ISP, chroma } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); diff --git a/src/intra.c b/src/intra.c index 1b7026e5..a1d0cf42 100644 --- a/src/intra.c +++ b/src/intra.c @@ -2019,6 +2019,7 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, int split_limit = uvg_get_isp_split_num(width, height, split_type, true); int cbf_context = 2; + state->quant_blocks[1].needs_init = true; for (int i = 0; i < split_limit; ++i) { search_data->pred_cu.intra.isp_index = i; diff --git a/src/search_intra.c b/src/search_intra.c index 2e507f95..17ec6747 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -361,7 +361,6 @@ static double search_intra_trdepth( search_data->lfnst_costs[i] = MAX_DOUBLE; } - for (trafo = mts_start; trafo < num_transforms; trafo++) { for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { // Initialize lfnst variables @@ -1492,6 +1491,7 @@ int8_t uvg_search_intra_chroma_rdo( ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; double original_c_lambda = state->c_lambda; + state->quant_blocks[2].needs_init = true; for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) { const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma; @@ -1968,7 +1968,8 @@ void uvg_search_cu_intra( number_of_modes_to_search++; } } - + + state->quant_blocks[0].needs_init = 1; search_intra_rdo( state, number_of_modes_to_search, From 00cc58bc5524a6810cc9e3ef67a5f013f70dda14 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 17 Apr 2023 14:45:55 +0300 Subject: [PATCH 221/254] [depquant] Only initialize rate_estimator when necessary --- src/dep_quant.c | 56 ++++++++++++++++++---------------------------- src/dep_quant.h | 19 ++++++++++++++++ src/encoderstate.h | 1 + src/intra.c | 5 +++++ src/search_intra.c | 1 + src/transform.c | 5 +++++ 6 files changed, 53 insertions(+), 34 deletions(-) diff --git 
a/src/dep_quant.c b/src/dep_quant.c index 96002664..ef73d7ed 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -42,13 +42,6 @@ #include -#define sm_numCtxSetsSig 3 -#define sm_numCtxSetsGtx 2 -#define sm_maxNumSigSbbCtx 2 -#define sm_maxNumSigCtx 12 -#define sm_maxNumGtxCtx 21 -#define SCALE_BITS 15 -#define RICEMAX 32 static const int32_t g_goRiceBits[4][RICEMAX] = { { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, @@ -102,16 +95,6 @@ typedef struct } common_context; -typedef struct -{ - int32_t m_lastBitsX[TR_MAX_WIDTH]; - int32_t m_lastBitsY[TR_MAX_WIDTH]; - uint32_t m_sigSbbFracBits[sm_maxNumSigSbbCtx][2]; - uint32_t m_sigFracBits[sm_numCtxSetsSig][sm_maxNumSigCtx][2]; - int32_t m_gtxFracBits[sm_maxNumGtxCtx][6]; -} rate_estimator; - - typedef struct { int64_t m_rdCost; @@ -451,12 +434,12 @@ static void reset_common_context(common_context* ctx, const rate_estimator * rat static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data_t * const ctx, color_t color) { const cabac_ctx_t * base_ctx = color == COLOR_Y ? ctx->ctx.sig_coeff_group_model : (ctx->ctx.sig_coeff_group_model + 2); - for (unsigned ctxId = 0; ctxId < sm_maxNumSigSbbCtx; ctxId++) { + for (unsigned ctxId = 0; ctxId < SM_MAX_NUM_SIG_SBB_CTX; ctxId++) { rate_estimator->m_sigSbbFracBits[ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); rate_estimator->m_sigSbbFracBits[ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); } unsigned numCtx = (color == COLOR_Y ? 12 : 8); - for (unsigned ctxSetId = 0; ctxSetId < sm_numCtxSetsSig; ctxSetId++) { + for (unsigned ctxSetId = 0; ctxSetId < SM_NUM_CTX_SETS_SIG; ctxSetId++) { base_ctx = color == COLOR_Y ? 
ctx->ctx.cu_sig_model_luma[ctxSetId] : ctx->ctx.cu_sig_model_chroma[ctxSetId]; for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { rate_estimator->m_sigFracBits[ctxSetId][ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); @@ -2309,7 +2292,8 @@ int uvg_dep_quant( } else { dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[0]; } - if (dep_quant_context.m_quant->needs_init) { + //TODO: no idea when it is safe not to reinit for inter + if (dep_quant_context.m_quant->needs_init || cur_tu->type == CU_INTER) { init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); } @@ -2352,11 +2336,15 @@ int uvg_dep_quant( } //===== real init ===== - rate_estimator rate_estimator; - init_rate_esimator(&rate_estimator, &state->search_cabac, compID); - xSetLastCoeffOffset(state, cur_tu, width, height, &rate_estimator, compID); + rate_estimator* rate_estimator = compID == COLOR_Y && cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP ? 
+ &state->rate_estimator[3] : &state->rate_estimator[compID]; + if(rate_estimator->needs_init || cur_tu->type == CU_INTER) { + init_rate_esimator(rate_estimator, &state->search_cabac, compID); + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); + rate_estimator->needs_init = false; + } - reset_common_context(&dep_quant_context.m_common_context, &rate_estimator, (width * height) >> 4, numCoeff); + reset_common_context(&dep_quant_context.m_common_context, rate_estimator, (width * height) >> 4, numCoeff); dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; @@ -2367,9 +2355,9 @@ int uvg_dep_quant( dep_quant_context.m_allStates.m_numSigSbb[k] = 0; dep_quant_context.m_allStates.m_remRegBins[k] = 4; // just large enough for last scan pos dep_quant_context.m_allStates.m_refSbbCtxId[k] = -1; - dep_quant_context.m_allStates.m_sigFracBits[k][0] = rate_estimator.m_sigFracBits[0][0][0]; - dep_quant_context.m_allStates.m_sigFracBits[k][1] = rate_estimator.m_sigFracBits[0][0][1]; - memcpy(dep_quant_context.m_allStates.m_coeffFracBits[k], rate_estimator.m_gtxFracBits[0], sizeof(dep_quant_context.m_allStates.m_coeffFracBits[k])); + dep_quant_context.m_allStates.m_sigFracBits[k][0] = rate_estimator->m_sigFracBits[0][0][0]; + dep_quant_context.m_allStates.m_sigFracBits[k][1] = rate_estimator->m_sigFracBits[0][0][1]; + memcpy(dep_quant_context.m_allStates.m_coeffFracBits[k], rate_estimator->m_gtxFracBits[0], sizeof(dep_quant_context.m_allStates.m_coeffFracBits[k])); dep_quant_context.m_allStates.m_goRicePar[k] = 0; dep_quant_context.m_allStates.m_goRiceZero[k] = 0; @@ -2378,7 +2366,7 @@ int uvg_dep_quant( dep_quant_context.m_allStates.m_stateId[k] = k & 3; for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { - memcpy(dep_quant_context.m_allStates.m_sigFracBitsArray[k][i], rate_estimator.m_sigFracBits[(k & 3 ? 
(k & 3) - 1 : 0)][i], sizeof(uint32_t) * 2); + memcpy(dep_quant_context.m_allStates.m_sigFracBitsArray[k][i], rate_estimator->m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i], sizeof(uint32_t) * 2); } } @@ -2388,19 +2376,19 @@ int uvg_dep_quant( dep_quant_context.m_allStates.all_lt_four = false; dep_quant_context.m_allStates.m_commonCtx = &dep_quant_context.m_common_context; for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { - memcpy(dep_quant_context.m_allStates.m_gtxFracBitsArray[i], rate_estimator.m_gtxFracBits[i], sizeof(int32_t) * 6); + memcpy(dep_quant_context.m_allStates.m_gtxFracBitsArray[i], rate_estimator->m_gtxFracBits[i], sizeof(int32_t) * 6); } - depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); + depquant_state_init(&dep_quant_context.m_startState, rate_estimator->m_sigFracBits[0][0], rate_estimator->m_gtxFracBits[0]); dep_quant_context.m_startState.effHeight = effectHeight; dep_quant_context.m_startState.effWidth = effectWidth; dep_quant_context.m_startState.m_stateId = 0; dep_quant_context.m_startState.m_commonCtx = &dep_quant_context.m_common_context; for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { - dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[0][i]; + dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator->m_sigFracBits[0][i]; } for (int i = 0; i < (compID == COLOR_Y ? 
21 : 11); ++i) { - dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator->m_gtxFracBits[i]; } const uint32_t height_in_sbb = MAX(height >> 2, 1); @@ -2416,7 +2404,7 @@ int uvg_dep_quant( init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); xDecideAndUpdate( - &rate_estimator, + rate_estimator, ctxs, scan_info, abs(srcCoeff[blkpos]), @@ -2433,7 +2421,7 @@ int uvg_dep_quant( } else { xDecideAndUpdate( - &rate_estimator, + rate_estimator, ctxs, scan_info, abs(srcCoeff[blkpos]), diff --git a/src/dep_quant.h b/src/dep_quant.h index 1f059119..a56b3941 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -36,6 +36,14 @@ #include "cu.h" #include "global.h" +#define SM_NUM_CTX_SETS_SIG 3 +#define SM_NUM_CTX_SETS_GTX 2 +#define SM_MAX_NUM_SIG_SBB_CTX 2 +#define SM_MAX_NUM_SIG_CTX 12 +#define SM_MAX_NUM_GTX_CTX 21 +#define SCALE_BITS 15 +#define RICEMAX 32 + typedef struct encoder_control_t encoder_control_t; struct dep_quant_scan_info @@ -65,6 +73,17 @@ typedef struct bool needs_init; } quant_block; +typedef struct +{ + int32_t m_lastBitsX[TR_MAX_WIDTH]; + int32_t m_lastBitsY[TR_MAX_WIDTH]; + uint32_t m_sigSbbFracBits[SM_MAX_NUM_SIG_SBB_CTX][2]; + uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2]; + int32_t m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6]; + bool needs_init; +} rate_estimator; + + typedef struct { uint8_t num; diff --git a/src/encoderstate.h b/src/encoderstate.h index f9d7d0a8..1779caaa 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -368,6 +368,7 @@ typedef struct encoder_state_t { int8_t collocated_luma_mode; quant_block quant_blocks[3]; // luma, ISP, chroma + rate_estimator rate_estimator[4]; // luma, cb, cr, isp } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); diff --git a/src/intra.c 
b/src/intra.c index a1d0cf42..22eb93c7 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1908,6 +1908,8 @@ void uvg_intra_recon_cu( int split_type = search_data->pred_cu.intra.isp_mode; int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + state->quant_blocks[1].needs_init = true; + for (int i = 0; i < split_limit; ++i) { cu_loc_t tu_loc; uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); @@ -1917,6 +1919,7 @@ void uvg_intra_recon_cu( if(tu_loc.x % 4 == 0) { intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); } + state->rate_estimator[3].needs_init = true; uvg_quantize_lcu_residual(state, true, false, false, &tu_loc, cur_cu, lcu, false, tree_type); @@ -2030,6 +2033,8 @@ double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, if (tu_loc.x % 4 == 0) { intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); } + + state->rate_estimator[3].needs_init = true; uvg_quantize_lcu_residual(state, true, false, false, &tu_loc, &search_data->pred_cu, lcu, false, UVG_LUMA_T); diff --git a/src/search_intra.c b/src/search_intra.c index 17ec6747..9d4c5da6 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1492,6 +1492,7 @@ int8_t uvg_search_intra_chroma_rdo( double original_c_lambda = state->c_lambda; state->quant_blocks[2].needs_init = true; + state->rate_estimator[1].needs_init = true; for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) { const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma; diff --git a/src/transform.c b/src/transform.c index 58051a87..5fb03abd 100644 --- a/src/transform.c +++ b/src/transform.c @@ -468,6 +468,7 @@ static void quantize_chroma( if (transform == DCT7_CHROMA) { abs_sum = 0; + state->rate_estimator[2].needs_init = true; uvg_dep_quant( state, cur_tu, @@ -1538,6 +1539,7 @@ void uvg_quantize_lcu_residual( cu_loc_t split_cu_loc[4]; uint16_t child_cbfs[3]; const int split_count = uvg_get_split_locs(cu_loc, split, 
split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { uvg_quantize_lcu_residual(state, luma, chroma, 0, &split_cu_loc[i], NULL, lcu, early_skip, tree_type); if(i != 0) { @@ -1558,11 +1560,14 @@ void uvg_quantize_lcu_residual( uvg_cu_loc_ctor(&loc, x, y, width, height); if (luma) { + state->quant_blocks[0].needs_init = true; + state->rate_estimator[0].needs_init = true; quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } double c_lambda = state->c_lambda; state->c_lambda = uvg_calculate_chroma_lambda(state, state->encoder_control->cfg.jccr, cur_pu->joint_cb_cr); if (chroma) { + state->rate_estimator[2].needs_init = true; if(state->encoder_control->cfg.dep_quant) { cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); From 8eb0f667344d3a7d6b2553b8a7a15bae7605059b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 18 Apr 2023 15:43:30 +0300 Subject: [PATCH 222/254] [depquant] update_state_eos_avx2 working --- src/dep_quant.c | 96 +++++++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index ef73d7ed..cef534fa 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -708,16 +708,10 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); - _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); - _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); - _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); } else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], 
decisions->rdCost[3], decisions->rdCost[3]); - _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); - _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); - _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else { @@ -735,11 +729,11 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; } } + rd_cost_a = _mm256_loadu_epi64(temp_rd_cost_a); + rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); + rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); } } - _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); - _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); - _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else if (state->all_lt_four) { __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); __m128i max_rice = _mm_set1_epi32(31); @@ -795,9 +789,6 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); } - _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); - _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); - _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); } else { const int pqAs[4] = {0, 0, 3, 3}; const int pqBs[4] = {2, 2, 1, 1}; @@ -1206,25 +1197,22 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); } } else if (all_between_zero_and_three) { - prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state = _mm_set1_epi32(ctxs->m_prev_state_offset); prev_state = _mm_add_epi32( prev_state, - _mm_sub_epi32( - _mm_loadu_epi32(decisions->prevId), - _mm_set1_epi32(4) - ) + _mm_loadu_epi32(decisions->prevId) ); __m128i num_sig_sbb = _mm_i32gather_epi32(&state->m_numSigSbb[state_offset], prev_state, 1); num_sig_sbb = _mm_and_epi32(num_sig_sbb, _mm_set1_epi32(0xff)); - num_sig_sbb = _mm_and_epi32( + num_sig_sbb = 
_mm_add_epi32( num_sig_sbb, - _mm_max_epi32(abs_level, _mm_set1_epi32(1)) + _mm_min_epi32(abs_level, _mm_set1_epi32(1)) ); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); - memcpy(&state->m_refSbbCtxId[state_offset], &num_sig_sbb_s, 4); + memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); int32_t prev_state_scalar[4]; _mm_storeu_epi32(prev_state_scalar, prev_state); @@ -1288,13 +1276,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32(&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); + __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32(cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_right ? _mm_i32gather_epi32(&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32(cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); - sig_sbb = _mm_max_epi32(sig_sbb, _mm_set1_epi32(1)); + sig_sbb = _mm_and_epi32(sig_sbb, _mm_set1_epi32(0xff)); + sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); __m256i sbb_frac_bits = _mm256_i32gather_epi64(cc->m_sbbFlagBits, sig_sbb, 8); _mm256_storeu_epi64(state->m_sbbFracBits[state_offset], sbb_frac_bits); @@ -1327,11 +1316,15 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i fours = _mm_set1_epi32(4); __m256i all[4]; uint64_t temp[4]; + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + for (int id = 0; id < 16; id++, nbOut++) { if (nbOut->num == 0) { temp[id % 4] = 0; if (id % 4 == 3) { - all[0] = _mm256_loadu_epi64(temp); + all[id / 4] = _mm256_loadu_epi64(temp); + all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); } continue; } @@ -1345,7 +1338,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( @@ -1360,7 +1353,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( @@ -1373,7 +1366,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i t = 
_mm_i32gather_epi32(absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( @@ -1386,7 +1379,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( @@ -1399,7 +1392,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_max_epi32(t, ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( @@ -1414,16 +1407,42 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); - _mm_add_epi32(template_ctx_init, sum_abs_1); + template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); if (id %4 == 3) { - all[0] = _mm256_loadu_epi64(temp); + all[id / 4] = _mm256_loadu_epi64(temp); + all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); last = template_ctx_init; } } + __m256i* v_src_tmp = all; + + __m256i v_tmp[4]; + v_tmp[0] = 
_mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset] + 8, _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 1] + 8, _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 2] + 8, _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 3] + 8, _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + for (int i = 0; i < 4; ++i) { memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); } @@ -1442,6 +1461,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + offsets = _mm_add_epi32(offsets, sum_abs_min); __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 8); _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); @@ -1451,7 +1471,7 @@ static 
void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, uint32_t sum_gt1_s[4]; _mm_storeu_epi32(sum_gt1_s, min_gt1); for (int i = 0; i < 4; ++i) { - memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); } } else { @@ -2217,14 +2237,14 @@ static void xDecideAndUpdate( if (scan_pos) { if (!(scan_pos & 15)) { SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); - updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); - updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); - updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); - updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); + update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); + //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, 
decisions, 0); + //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); - printf("\n"); } else if (!zeroOut) { update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); From 7fdc045690e08aac57d0cfe4f29dfd78a0a6fbb4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 19 Apr 2023 12:01:24 +0300 Subject: [PATCH 223/254] [dep_quant] Clean up --- src/dep_quant.c | 165 ++++++++++++++++++++++++--------------------- src/dep_quant.h | 2 +- src/encoderstate.h | 2 +- src/transform.c | 1 + 4 files changed, 92 insertions(+), 78 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index cef534fa..6dfa1f14 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -40,6 +40,7 @@ #include "uvg_math.h" #include "generic/quant-generic.h" #include +#include @@ -246,7 +247,6 @@ int uvg_init_nb_info(encoder_control_t * encoder) { { nbSbb->inPos[k] = 0; } - printf(""); } { //===== outside subband neighbours 
===== @@ -416,7 +416,7 @@ static void init_quant_block( qp->needs_init = false; } -static void reset_common_context(common_context* ctx, const rate_estimator * rate_estimator, int numSbb, int num_coeff) +static void reset_common_context(common_context* ctx, const rate_estimator_t * rate_estimator, int numSbb, int num_coeff) { //memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); @@ -431,7 +431,7 @@ static void reset_common_context(common_context* ctx, const rate_estimator * rat ctx->num_coeff = num_coeff; } -static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data_t * const ctx, color_t color) +static void init_rate_esimator(rate_estimator_t * rate_estimator, const cabac_data_t * const ctx, color_t color) { const cabac_ctx_t * base_ctx = color == COLOR_Y ? ctx->ctx.sig_coeff_group_model : (ctx->ctx.sig_coeff_group_model + 2); for (unsigned ctxId = 0; ctxId < SM_MAX_NUM_SIG_SBB_CTX; ctxId++) { @@ -453,7 +453,7 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? 
&ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; - int32_t* cb = &rate_estimator->m_gtxFracBits[ctxId]; + int32_t* cb = rate_estimator->m_gtxFracBits[ctxId]; int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); int32_t par1 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 1); cb[0] = 0; @@ -471,7 +471,7 @@ static void xSetLastCoeffOffset( const cu_info_t* const cur_tu, const int width, const int height, - rate_estimator* rate_estimator, + rate_estimator_t* rate_estimator, const color_t compID) { int32_t cbfDeltaBits = 0; @@ -579,7 +579,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); - __m256i rd_cost_a = _mm256_loadu_si256(&state->m_rdCost[start]); + __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); __m256i rd_cost_b = rd_cost_a; __m256i rd_cost_z = rd_cost_a; @@ -611,7 +611,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); } else { const int pqAs[4] = {0, 0, 3, 3}; - int64_t rd_costs[4] = {0, 0, 0, 0}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; for (int i = 0; i < 4; i++) { const int state_offset = start + i; const int pqA = pqAs[i]; @@ -623,7 +623,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; } } - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256(&rd_costs[0])); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); } if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { @@ -661,7 +661,8 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } } - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256(&rd_costs[0])); + rd_cost_b = + _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); } if (spt == SCAN_ISCSBB) { @@ -871,7 +872,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en _mm256_storeu_epi64(decisions->rdCost, final_rd_cost); final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); - _mm256_storeu2_m128i(decisions->prevId, decisions->absLevel, final_data); + _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); } @@ -984,7 +985,7 @@ static INLINE void checkRdCostStart(const depquant_state* const state, int32_t l static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) { int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; - coeff_t qIdx = MAX(1, MIN(qp->m_maxQIdx, (coeff_t)((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; int index = qIdx & 3; pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; @@ -1182,34 +1183,34 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (all_above_minus_two) { bool all_have_previous_state = true; __m128i prev_state; + __m128i 
prev_state_no_offset; __m128i abs_level = _mm_loadu_epi32(decisions->absLevel); if (all_above_four) { prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state_no_offset = _mm_sub_epi32(_mm_loadu_epi32(decisions->prevId), _mm_set1_epi32(4)); prev_state = _mm_add_epi32( prev_state, - _mm_sub_epi32( - _mm_loadu_epi32(decisions->prevId), - _mm_set1_epi32(4) - ) + prev_state_no_offset ); memset(&state->m_numSigSbb[state_offset], 0, 4); for (int i = 0; i < 4; ++i) { memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); } } else if (all_between_zero_and_three) { - prev_state = _mm_set1_epi32(ctxs->m_prev_state_offset); + prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); prev_state = _mm_add_epi32( - prev_state, + prev_state_no_offset, _mm_loadu_epi32(decisions->prevId) ); - __m128i num_sig_sbb = _mm_i32gather_epi32(&state->m_numSigSbb[state_offset], prev_state, 1); - num_sig_sbb = _mm_and_epi32(num_sig_sbb, _mm_set1_epi32(0xff)); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i prev_state_with_ff_high_bytes = _mm_or_epi32(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i num_sig_sbb = _mm_loadu_epi32(state->m_numSigSbb); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); num_sig_sbb = _mm_add_epi32( num_sig_sbb, _mm_min_epi32(abs_level, _mm_set1_epi32(1)) ); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); @@ -1221,15 +1222,18 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } } else { int prev_state_s[4] = {-1, -1, -1, -1}; + int prev_state_no_offset_s[4] = {-1, -1, -1, -1}; for (int i = 0; i < 4; ++i) { const int decision_id = i; const int curr_state_offset = state_offset + i; if 
(decisions->prevId[decision_id] >= 4) { prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + prev_state_no_offset_s[i] = decisions->prevId[decision_id] - 4; state->m_numSigSbb[curr_state_offset] = 0; memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + prev_state_no_offset_s[i] = decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); } else { @@ -1239,6 +1243,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } } prev_state = _mm_loadu_epi32(prev_state_s); + prev_state_no_offset = _mm_loadu_epi32(prev_state_no_offset_s); } uint32_t level_offset = scan_pos & 15; __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); @@ -1276,15 +1281,24 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32(cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); + __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32(cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); sig_sbb = _mm_and_epi32(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); - __m256i sbb_frac_bits = _mm256_i32gather_epi64(cc->m_sbbFlagBits, sig_sbb, 8); + //__m256i sig_sbb_mask = _mm256_cvtepi32_epi64(sig_sbb); + //const __m256i duplication_mask = _mm256_setr_epi8( + // 0, 0, 0, 0, 0, 0, 0, 0, + // 1, 1, 1, 1, 1, 1, 1, 1, + // 2, 2, 2, 2, 2, 2, 2, 2, + // 3, 3, 3, 3, 3, 3, 3, 3); + //sig_sbb_mask = _mm256_shuffle_epi8(sig_sbb_mask, duplication_mask); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); + //__m256i sbb_frac_bits = _mm256_loadu_epi64(cc->m_sbbFlagBits); + //sbb_frac_bits = _mm256_shu _mm256_storeu_epi64(state->m_sbbFracBits[state_offset], sbb_frac_bits); memset(&state->m_numSigSbb[state_offset], 0, 4); @@ -1294,6 +1308,9 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memcpy(&state->m_refSbbCtxId[state_offset], states, 4); if (all_have_previous_state) { __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + //prev_state_no_offset = _mm_shuffle_epi8(prev_state_no_offset, _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)); + //__m128i rem_reg_bins = _mm_loadu_epi32(&state->m_remRegBins[previous_state_array[0] & 0xfc]); + //rem_reg_bins = _mm_shuffle_epi8(rem_reg_bins, mask); _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); } else { const int temp = (state->effWidth * state->effHeight * 28) / 16; @@ -1335,7 +1352,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, case 5: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); - __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + __m128i t = _mm_i32gather_epi32((const int *)absLevels, 
offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); @@ -1350,7 +1367,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } case 4: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); - __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); @@ -1363,7 +1380,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } case 3: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); - __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); @@ -1376,7 +1393,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } case 2: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); - __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); @@ -1389,7 +1406,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } case 1: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); - __m128i t = _mm_i32gather_epi32(absLevels, offset, 1); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); @@ -1462,7 +1479,7 @@ static void 
update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); offsets = _mm_add_epi32(offsets, sum_abs_min); - __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 8); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); @@ -1588,34 +1605,29 @@ static INLINE void update_states_avx2( __m128i prv_states = _mm_loadu_epi32(decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); prv_states = _mm_add_epi32(prv_states, prev_offset); - - - //__m128i num_sig_sbb = _mm_i32gather_epi32(state->m_numSigSbb, prv_states, 1); - //__m128 mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff); - //num_sig_sbb - - - int32_t prv_states_scalar[4]; - _mm_storeu_epi32(prv_states_scalar, prv_states); - int8_t sig_sbb[4] = {state->m_numSigSbb[prv_states_scalar[0]], state->m_numSigSbb[prv_states_scalar[1]], state->m_numSigSbb[prv_states_scalar[2]], state->m_numSigSbb[prv_states_scalar[3]]}; - for (int i = 0; i < 4; ++i) { - sig_sbb[i] = sig_sbb[i] || decisions->absLevel[i]; - } - memcpy(&state->m_numSigSbb[state_offset], sig_sbb, 4); - - __m128i ref_sbb_ctx_idx = _mm_i32gather_epi32(state->m_refSbbCtxId, prv_states, 1); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, control); + __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); + + __m128i sig_sbb = _mm_loadu_epi32(state->m_numSigSbb); + sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); + __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); + has_coeff = _mm_shuffle_epi8(has_coeff, control); + sig_sbb = _mm_or_epi32(sig_sbb, has_coeff); + int sig_sbb_i = 
_mm_extract_epi32(sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); + + __m128i ref_sbb_ctx_idx = _mm_loadu_epi32(state->m_refSbbCtxId); + ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); - - __m128i go_rice_par = _mm_i32gather_epi32(state->m_goRicePar, prv_states, 1); - go_rice_par = _mm_shuffle_epi8(go_rice_par, control); + + __m128i go_rice_par = _mm_loadu_epi32(state->m_goRicePar); + go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states); int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - __m256i sbb_frac_bits = _mm256_i32gather_epi64(state->m_sbbFracBits, prv_states, 8); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); _mm256_storeu_epi64(&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); @@ -1638,6 +1650,8 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); + int32_t prv_states_scalar[4]; + _mm_storeu_epi32(prv_states_scalar, prv_states); for (int i = 0; i < 4; ++i) { memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); } @@ -1713,19 +1727,19 @@ static INLINE void update_states_avx2( const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); __m128i tinit = _mm_i32gather_epi32( - state->m_absLevelsAndCtxInit[state_offset], + (int *)state->m_absLevelsAndCtxInit[state_offset], _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), 2); tinit = _mm_and_epi32(tinit, first_two_bytes); __m128i sum_abs1 = 
_mm_and_epi32(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_epi32(tinit, _mm_set1_epi32(7)); - uint8_t* levels = state->m_absLevelsAndCtxInit[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; switch (numIPos) { case 5: { __m128i t = _mm_i32gather_epi32( - levels, + (int *)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); t = _mm_and_epi32(t, first_byte); @@ -1744,7 +1758,7 @@ static INLINE void update_states_avx2( case 4: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); t = _mm_and_epi32(t, first_byte); @@ -1763,7 +1777,7 @@ static INLINE void update_states_avx2( case 3: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); t = _mm_and_epi32(t, first_byte); @@ -1782,7 +1796,7 @@ static INLINE void update_states_avx2( case 2: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); t = _mm_and_epi32(t, first_byte); @@ -1800,7 +1814,7 @@ static INLINE void update_states_avx2( } case 1: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); t = _mm_and_epi32(t, first_byte); @@ -1826,7 +1840,7 @@ static INLINE void update_states_avx2( _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), _mm_set1_epi32(3)); offsets = _mm_add_epi32(offsets, temp); - __m256i sig_frac_bits = _mm256_i32gather_epi64(state->m_sigFracBitsArray[state_offset][0], offsets, 8); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); @@ -1843,7 
+1857,7 @@ static INLINE void update_states_avx2( case 5: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); sum_abs = _mm_add_epi32(t, sum_abs); @@ -1851,7 +1865,7 @@ static INLINE void update_states_avx2( case 4: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); sum_abs = _mm_add_epi32(t, sum_abs); @@ -1859,7 +1873,7 @@ static INLINE void update_states_avx2( case 3: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); sum_abs = _mm_add_epi32(t, sum_abs); @@ -1867,7 +1881,7 @@ static INLINE void update_states_avx2( case 2: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); sum_abs = _mm_add_epi32(t, sum_abs); @@ -1875,7 +1889,7 @@ static INLINE void update_states_avx2( case 1: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); sum_abs = _mm_add_epi32(t, sum_abs); @@ -1901,15 +1915,14 @@ static INLINE void update_states_avx2( } else if (rem_reg_all_lt4) { - uint8_t* levels = state->m_absLevelsAndCtxInit[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; const __m128i last_two_bytes = _mm_set1_epi32(0xffff); const __m128i last_byte = _mm_set1_epi32(0xff); - const __m128i ones = _mm_set1_epi32(1); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); __m128i tinit = _mm_i32gather_epi32( - state->m_absLevelsAndCtxInit[state_offset], + (int*)state->m_absLevelsAndCtxInit[state_offset], 
_mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), 2); tinit = _mm_and_epi32(tinit, last_two_bytes); @@ -1917,7 +1930,7 @@ static INLINE void update_states_avx2( switch (numIPos) { case 5: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); t = _mm_and_epi32(t, last_byte); @@ -1925,7 +1938,7 @@ static INLINE void update_states_avx2( } case 4: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); t = _mm_and_epi32(t, last_byte); @@ -1933,7 +1946,7 @@ static INLINE void update_states_avx2( } case 3: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); t = _mm_and_epi32(t, last_byte); @@ -1941,7 +1954,7 @@ static INLINE void update_states_avx2( } case 2: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); t = _mm_and_epi32(t, last_byte); @@ -1949,7 +1962,7 @@ static INLINE void update_states_avx2( } case 1: { __m128i t = _mm_i32gather_epi32( - levels, + (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); t = _mm_and_epi32(t, last_byte); @@ -2205,7 +2218,7 @@ static INLINE void updateState( static bool same[13]; static void xDecideAndUpdate( - rate_estimator* re, + rate_estimator_t* re, context_store* ctxs, struct dep_quant_scan_info const* const scan_info, const coeff_t absCoeff, @@ -2215,8 +2228,8 @@ static void xDecideAndUpdate( const NbInfoSbb next_nb_info_ssb, bool zeroOut, coeff_t quantCoeff, - int effWidth, - int effHeight, + const uint32_t effWidth, + const uint32_t effHeight, bool is_chroma) { Decision* decisions = &ctxs->m_trellis[scan_pos]; @@ -2356,8 +2369,8 @@ int uvg_dep_quant( } //===== real init ===== - rate_estimator* rate_estimator 
= compID == COLOR_Y && cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP ? - &state->rate_estimator[3] : &state->rate_estimator[compID]; + rate_estimator_t* rate_estimator = (rate_estimator_t *)(compID == COLOR_Y && cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP ? + &state->rate_estimator[3] : &state->rate_estimator[compID]); if(rate_estimator->needs_init || cur_tu->type == CU_INTER) { init_rate_esimator(rate_estimator, &state->search_cabac, compID); xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); diff --git a/src/dep_quant.h b/src/dep_quant.h index a56b3941..ebb54d31 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -81,7 +81,7 @@ typedef struct uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2]; int32_t m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6]; bool needs_init; -} rate_estimator; +} rate_estimator_t; typedef struct diff --git a/src/encoderstate.h b/src/encoderstate.h index 1779caaa..88409703 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -368,7 +368,7 @@ typedef struct encoder_state_t { int8_t collocated_luma_mode; quant_block quant_blocks[3]; // luma, ISP, chroma - rate_estimator rate_estimator[4]; // luma, cb, cr, isp + rate_estimator_t rate_estimator[4]; // luma, cb, cr, isp } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); diff --git a/src/transform.c b/src/transform.c index 5fb03abd..969394df 100644 --- a/src/transform.c +++ b/src/transform.c @@ -437,6 +437,7 @@ static void quantize_chroma( int8_t height = cu_loc->chroma_height; if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { int abs_sum = 0; + state->quant_blocks[1].needs_init = state->encoder_control->cfg.jccr; uvg_dep_quant( state, cur_tu, From 6d0a3fa5fca4ac87df4db1c936345979fe613d6c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 19 Apr 2023 12:34:43 +0300 Subject: [PATCH 224/254] [avx2] Replace _mm_and_epi32 with _mm_and_si128 --- 
src/dep_quant.c | 77 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 6dfa1f14..14659709 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -40,7 +40,6 @@ #include "uvg_math.h" #include "generic/quant-generic.h" #include -#include @@ -1287,7 +1286,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); - sig_sbb = _mm_and_epi32(sig_sbb, _mm_set1_epi32(0xff)); + sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); //__m256i sig_sbb_mask = _mm256_cvtepi32_epi64(sig_sbb); //const __m256i duplication_mask = _mm256_setr_epi8( @@ -1353,14 +1352,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( fours, - _mm_and_epi32(t, ones) + _mm_and_si128(t, ones) ) ); sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); @@ -1368,53 +1367,53 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, case 4: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( 
fours, - _mm_and_epi32(t, ones))); + _mm_and_si128(t, ones))); sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 3: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( fours, - _mm_and_epi32(t, ones))); + _mm_and_si128(t, ones))); sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 2: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( fours, - _mm_and_epi32(t, ones))); + _mm_and_si128(t, ones))); sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 1: { __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( t, _mm_add_epi32( fours, - _mm_and_epi32(t, ones))); + _mm_and_si128(t, ones))); sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } break; @@ -1465,8 +1464,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } } - __m128i sum_num = _mm_and_epi32(last, _mm_set1_epi32(7)); - __m128i sum_abs1 = _mm_and_epi32( + __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_si128( _mm_srli_epi32(last, 3), _mm_set1_epi32(31)); @@ -1730,9 +1729,9 @@ static INLINE void update_states_avx2( (int 
*)state->m_absLevelsAndCtxInit[state_offset], _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), 2); - tinit = _mm_and_epi32(tinit, first_two_bytes); - __m128i sum_abs1 = _mm_and_epi32(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); - __m128i sum_num = _mm_and_epi32(tinit, _mm_set1_epi32(7)); + tinit = _mm_and_si128(tinit, first_two_bytes); + __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); + __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; switch (numIPos) { @@ -1742,9 +1741,9 @@ static INLINE void update_states_avx2( (int *)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t ); sum_abs1 = _mm_add_epi32( @@ -1753,7 +1752,7 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); } case 4: { @@ -1761,9 +1760,9 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t ); sum_abs1 = _mm_add_epi32( @@ -1772,7 +1771,7 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); } case 3: { @@ -1780,9 +1779,9 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, 
_mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t ); sum_abs1 = _mm_add_epi32( @@ -1791,7 +1790,7 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); } case 2: { @@ -1799,9 +1798,9 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t ); sum_abs1 = _mm_add_epi32( @@ -1810,16 +1809,16 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); } case 1: { __m128i t = _mm_i32gather_epi32( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); - t = _mm_and_epi32(t, first_byte); + t = _mm_and_si128(t, first_byte); __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_epi32(t, ones)), + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t ); sum_abs1 = _mm_add_epi32( @@ -1828,7 +1827,7 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_epi32(t, first_byte), ones)); + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); } break; default: assert(0); @@ -1897,7 +1896,7 @@ static INLINE void update_states_avx2( default: assert(0); } - sum_abs = _mm_and_epi32(sum_abs, first_byte); + sum_abs = _mm_and_si128(sum_abs, first_byte); if (extRiceFlag) { assert(0 && "Not 
implemented for avx2"); } else { @@ -1925,7 +1924,7 @@ static INLINE void update_states_avx2( (int*)state->m_absLevelsAndCtxInit[state_offset], _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), 2); - tinit = _mm_and_epi32(tinit, last_two_bytes); + tinit = _mm_and_si128(tinit, last_two_bytes); __m128i sum_abs = _mm_srli_epi32(tinit, 8); switch (numIPos) { case 5: { @@ -1933,7 +1932,7 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 4: { @@ -1941,7 +1940,7 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 3: { @@ -1949,7 +1948,7 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 2: { @@ -1957,7 +1956,7 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 1: { @@ -1965,7 +1964,7 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); - t = _mm_and_epi32(t, last_byte); + t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } break; default: From 8b1d6fab59931f79a0c0c47b244028822f5431e7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 19 Apr 2023 13:07:38 +0300 Subject: [PATCH 225/254] [avx2] Replace loads and stores with non-avx512 stores --- 
src/dep_quant.c | 154 ++++++++++++++++++++++-------------------------- 1 file changed, 69 insertions(+), 85 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 14659709..051cb3ae 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -76,9 +76,9 @@ typedef struct typedef struct { - int64_t rdCost[8]; - int32_t absLevel[8]; - int32_t prevId[8]; + int64_t ALIGNED(32) rdCost[8]; + int32_t ALIGNED(32) absLevel[8]; + int32_t ALIGNED(32) prevId[8]; } Decision; @@ -118,19 +118,19 @@ typedef struct typedef struct { - int64_t m_rdCost[12]; - uint16_t m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t m_numSigSbb[12]; - int m_remRegBins[12]; - int8_t m_refSbbCtxId[12]; - uint32_t m_sbbFracBits[12][2]; - uint32_t m_sigFracBits[12][2]; - int32_t m_coeffFracBits[12][6]; - int8_t m_goRicePar[12]; - int8_t m_goRiceZero[12]; - int8_t m_stateId[12]; - uint32_t m_sigFracBitsArray[12][12][2]; - int32_t m_gtxFracBitsArray[21][6]; + int64_t ALIGNED(32) m_rdCost[12]; + uint16_t ALIGNED(32) m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t ALIGNED(16) m_numSigSbb[12]; + int ALIGNED(32) m_remRegBins[12]; + int8_t ALIGNED(16) m_refSbbCtxId[12]; + uint32_t ALIGNED(32) m_sbbFracBits[12][2]; + uint32_t ALIGNED(32) m_sigFracBits[12][2]; + int32_t ALIGNED(32) m_coeffFracBits[12][6]; + int8_t ALIGNED(16) m_goRicePar[12]; + int8_t ALIGNED(16) m_goRiceZero[12]; + int8_t ALIGNED(16) m_stateId[12]; + uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; + int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; common_context* m_commonCtx; unsigned effWidth; @@ -715,10 +715,10 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en } else { - const int pqAs[4] = {0, 0, 3, 3}; - _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); - _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); - _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; + 
_mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); + _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); + _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); for (int i = 0; i < 4; i++) { const int state_offset = start + i; if (state->m_numSigSbb[state_offset]) { @@ -729,15 +729,15 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; } } - rd_cost_a = _mm256_loadu_epi64(temp_rd_cost_a); - rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); - rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); } } } else if (state->all_lt_four) { __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); __m128i max_rice = _mm_set1_epi32(31); - __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(&state->m_goRiceZero[start])); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); // RD cost A { __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); @@ -750,7 +750,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); __m128i offsets = _mm_add_epi32(selected, go_rice_offset); @@ -771,7 +771,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + __m128i go_rice_offset = 
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); __m128i offsets = _mm_add_epi32(selected, go_rice_offset); @@ -782,7 +782,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en } // RD cost Z { - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); @@ -838,17 +838,17 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en temp_rd_cost_b[i] = rdCostB; temp_rd_cost_z[i] = rdCostZ; } - rd_cost_a = _mm256_loadu_epi64(temp_rd_cost_a); - rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); - rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); } rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); - __m256i rd_cost_decision = _mm256_loadu_epi64(decisions->rdCost); + __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); - __m256i decision_abs_coeff = _mm256_loadu_epi32(decisions->absLevel); - __m256i decision_prev_state = _mm256_loadu_epi32(decisions->prevId); + __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); + __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); @@ -869,7 +869,7 @@ static void check_rd_costs_avx2(const 
all_depquant_states* const state, const en __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, final_decision); __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); - _mm256_storeu_epi64(decisions->rdCost, final_rd_cost); + _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); } @@ -1172,8 +1172,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_loadu_epi64(decisions->rdCost); - _mm256_storeu_epi64(&ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); for (int i = 0; i < 4; ++i) { all_above_minus_two &= decisions->prevId[i] > -2; all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; @@ -1183,10 +1183,10 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, bool all_have_previous_state = true; __m128i prev_state; __m128i prev_state_no_offset; - __m128i abs_level = _mm_loadu_epi32(decisions->absLevel); + __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); if (all_above_four) { prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); - prev_state_no_offset = _mm_sub_epi32(_mm_loadu_epi32(decisions->prevId), _mm_set1_epi32(4)); + prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); prev_state = _mm_add_epi32( prev_state, prev_state_no_offset @@ -1199,11 +1199,11 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); 
prev_state = _mm_add_epi32( prev_state_no_offset, - _mm_loadu_epi32(decisions->prevId) + _mm_load_si128((const __m128i*)decisions->prevId) ); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i prev_state_with_ff_high_bytes = _mm_or_epi32(prev_state, _mm_set1_epi32(0xffffff00)); - __m128i num_sig_sbb = _mm_loadu_epi32(state->m_numSigSbb); + __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); num_sig_sbb = _mm_add_epi32( num_sig_sbb, @@ -1215,24 +1215,21 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); int32_t prev_state_scalar[4]; - _mm_storeu_epi32(prev_state_scalar, prev_state); + _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); for (int i = 0; i < 4; ++i) { memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); } } else { int prev_state_s[4] = {-1, -1, -1, -1}; - int prev_state_no_offset_s[4] = {-1, -1, -1, -1}; for (int i = 0; i < 4; ++i) { const int decision_id = i; const int curr_state_offset = state_offset + i; if (decisions->prevId[decision_id] >= 4) { prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); - prev_state_no_offset_s[i] = decisions->prevId[decision_id] - 4; state->m_numSigSbb[curr_state_offset] = 0; memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - prev_state_no_offset_s[i] = decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * 
sizeof(uint8_t)); } else { @@ -1241,13 +1238,12 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, all_have_previous_state = false; } } - prev_state = _mm_loadu_epi32(prev_state_s); - prev_state_no_offset = _mm_loadu_epi32(prev_state_no_offset_s); + prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); } uint32_t level_offset = scan_pos & 15; __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); uint32_t max_abs_s[4]; - _mm_storeu_epi32(max_abs_s, max_abs); + _mm_storeu_si128((__m128i*)max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; levels[level_offset] = max_abs_s[i]; @@ -1260,7 +1256,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, common_context* cc = &ctxs->m_common_context; size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); int previous_state_array[4]; - _mm_storeu_epi32(previous_state_array, prev_state); + _mm_storeu_si128((__m128i*)previous_state_array, prev_state); for (int curr_state = 0; curr_state < 4; ++curr_state) { uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; @@ -1288,17 +1284,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); - //__m256i sig_sbb_mask = _mm256_cvtepi32_epi64(sig_sbb); - //const __m256i duplication_mask = _mm256_setr_epi8( - // 0, 0, 0, 0, 0, 0, 0, 0, - // 1, 1, 1, 1, 1, 1, 1, 1, - // 2, 2, 2, 2, 2, 2, 2, 2, - // 3, 3, 3, 3, 3, 3, 3, 3); - //sig_sbb_mask = _mm256_shuffle_epi8(sig_sbb_mask, duplication_mask); __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); - //__m256i sbb_frac_bits = 
_mm256_loadu_epi64(cc->m_sbbFlagBits); - //sbb_frac_bits = _mm256_shu - _mm256_storeu_epi64(state->m_sbbFracBits[state_offset], sbb_frac_bits); + _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); memset(&state->m_numSigSbb[state_offset], 0, 4); memset(&state->m_goRicePar[state_offset], 0, 4); @@ -1307,10 +1294,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memcpy(&state->m_refSbbCtxId[state_offset], states, 4); if (all_have_previous_state) { __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); - //prev_state_no_offset = _mm_shuffle_epi8(prev_state_no_offset, _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)); - //__m128i rem_reg_bins = _mm_loadu_epi32(&state->m_remRegBins[previous_state_array[0] & 0xfc]); - //rem_reg_bins = _mm_shuffle_epi8(rem_reg_bins, mask); - _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); } else { const int temp = (state->effWidth * state->effHeight * 28) / 16; for (int i = 0; i < 4; ++i) { @@ -1339,7 +1323,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (nbOut->num == 0) { temp[id % 4] = 0; if (id % 4 == 3) { - all[id / 4] = _mm256_loadu_epi64(temp); + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); } continue; @@ -1427,8 +1411,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); - if (id %4 == 3) { - all[id / 4] = _mm256_loadu_epi64(temp); + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); all[id / 4] = _mm256_shuffle_epi8(all[id / 4], 
v_shuffle); last = template_ctx_init; } @@ -1454,10 +1438,10 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset] + 8, _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 1] + 8, _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 2] + 8, _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_storeu_epi16(state->m_absLevelsAndCtxInit[state_offset + 3] + 8, _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); for (int i = 0; i < 4; ++i) { memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); @@ -1479,13 +1463,13 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); offsets = _mm_add_epi32(offsets, sum_abs_min); __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); - _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); __m128i sum_gt1 = 
_mm_sub_epi32(sum_abs1, sum_num); __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); uint32_t sum_gt1_s[4]; - _mm_storeu_epi32(sum_gt1_s, min_gt1); + _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); for (int i = 0; i < 4; ++i) { memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); } @@ -1592,22 +1576,22 @@ static INLINE void update_states_avx2( all_minus_one &= decisions->prevId[i] == -1; } int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_loadu_epi64(decisions->rdCost); - _mm256_storeu_epi64(&ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); if (all_above_minus_two) { bool rem_reg_all_gte_4 = true; bool rem_reg_all_lt4 = true; - __m128i abs_level = _mm_loadu_epi32(decisions->absLevel); + __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); if (all_non_negative) { - __m128i prv_states = _mm_loadu_epi32(decisions->prevId); + __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); prv_states = _mm_add_epi32(prv_states, prev_offset); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); - __m128i sig_sbb = _mm_loadu_epi32(state->m_numSigSbb); + __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); has_coeff = _mm_shuffle_epi8(has_coeff, control); @@ -1615,19 +1599,19 @@ static INLINE void update_states_avx2( int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); - __m128i ref_sbb_ctx_idx = 
_mm_loadu_epi32(state->m_refSbbCtxId); + __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId); ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); - __m128i go_rice_par = _mm_loadu_epi32(state->m_goRicePar); + __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar); go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states); int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); - _mm256_storeu_epi64(&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); + _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); __m128i ones = _mm_set1_epi32(1); @@ -1640,7 +1624,7 @@ static INLINE void update_states_avx2( __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); - _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins); __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); int bit_mask = _mm_movemask_epi8(mask); @@ -1650,7 +1634,7 @@ static INLINE void update_states_avx2( rem_reg_all_lt4 = (bit_mask == 0xFFFF); int32_t prv_states_scalar[4]; - _mm_storeu_epi32(prv_states_scalar, prv_states); + _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); for (int i = 0; i < 4; ++i) { memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); } @@ -1668,7 +1652,7 @@ 
static INLINE void update_states_avx2( _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) ); rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); - _mm_storeu_epi32(&state->m_remRegBins[state_offset], rem_reg_bins); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); int bit_mask = _mm_movemask_epi8(mask); @@ -1711,7 +1695,7 @@ static INLINE void update_states_avx2( uint32_t level_offset = scan_pos & 15; __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); uint32_t max_abs_s[4]; - _mm_storeu_epi32(max_abs_s, max_abs); + _mm_storeu_si128((__m128i*)max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; levels[level_offset] = max_abs_s[i]; @@ -1840,12 +1824,12 @@ static INLINE void update_states_avx2( _mm_set1_epi32(3)); offsets = _mm_add_epi32(offsets, temp); __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); - _mm256_storeu_epi64(&state->m_sigFracBits[state_offset][0], sig_frac_bits); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); uint32_t sum_gt1_s[4]; - _mm_storeu_epi32(sum_gt1_s, sum_gt1); + _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); for (int i = 0; i < 4; ++i) { memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); } From 0591342b3a50bf26d37b65944f6a569d215d0533 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 19 Apr 2023 14:00:36 +0300 Subject: [PATCH 226/254] [avx2] replace or --- src/dep_quant.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 051cb3ae..39439c40 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1202,7 
+1202,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm_load_si128((const __m128i*)decisions->prevId) ); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i prev_state_with_ff_high_bytes = _mm_or_epi32(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); num_sig_sbb = _mm_add_epi32( @@ -1281,7 +1281,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); - __m128i sig_sbb = _mm_or_epi32(sbb_right, sbb_below); + __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); @@ -1595,7 +1595,7 @@ static INLINE void update_states_avx2( sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); has_coeff = _mm_shuffle_epi8(has_coeff, control); - sig_sbb = _mm_or_epi32(sig_sbb, has_coeff); + sig_sbb = _mm_or_si128(sig_sbb, has_coeff); int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); From dfff9a8030f72568e3f2baf507457583a12e2013 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 17 Apr 2023 15:14:35 +0300 Subject: [PATCH 227/254] [avx2] Move dep quant stuff to strategies --- CMakeLists.txt | 1 - src/dep_quant.c | 1464 +-------------------- src/dep_quant.h | 123 
++ src/strategies/avx2/depquant-avx2.c | 1389 +++++++++++++++++++ src/strategies/avx2/depquant-avx2.h | 46 + src/strategies/generic/depquant-generic.c | 238 ++++ src/strategies/generic/depquant-generic.h | 50 + src/strategies/strategies-depquant.c | 54 + src/strategies/strategies-depquant.h | 77 ++ src/strategies/strategies-quant.c | 13 +- src/strategyselector.c | 4 + src/strategyselector.h | 2 + 12 files changed, 1997 insertions(+), 1464 deletions(-) create mode 100644 src/strategies/avx2/depquant-avx2.c create mode 100644 src/strategies/avx2/depquant-avx2.h create mode 100644 src/strategies/generic/depquant-generic.c create mode 100644 src/strategies/generic/depquant-generic.h create mode 100644 src/strategies/strategies-depquant.c create mode 100644 src/strategies/strategies-depquant.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6460743b..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,6 @@ target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") -file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/dep_quant.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index 39439c40..519e5795 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -39,10 +39,8 @@ #include "transform.h" #include "uvg_math.h" #include "generic/quant-generic.h" -#include - - +#include "strategies-depquant.h" static const int32_t g_goRiceBits[4][RICEMAX] = { { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 
458752}, { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, @@ -56,102 +54,6 @@ static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; -enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; - - - - -typedef struct -{ - uint8_t* sbbFlags; - uint8_t* levels; -} SbbCtx; - - -typedef struct -{ - int32_t absLevel[4]; - int64_t deltaDist[4]; -} PQData; - -typedef struct -{ - int64_t ALIGNED(32) rdCost[8]; - int32_t ALIGNED(32) absLevel[8]; - int32_t ALIGNED(32) prevId[8]; -} Decision; - - -typedef struct -{ - const NbInfoOut* m_nbInfo; - uint32_t m_sbbFlagBits[2][2]; - SbbCtx m_allSbbCtx[8]; - int m_curr_sbb_ctx_offset; - int m_prev_sbb_ctx_offset; - uint8_t sbb_memory[8 * 1024]; - uint8_t level_memory[8* TR_MAX_WIDTH * TR_MAX_WIDTH]; - int num_coeff; -} common_context; - - -typedef struct -{ - int64_t m_rdCost; - uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t m_numSigSbb; - int m_remRegBins; - int8_t m_refSbbCtxId; - uint32_t m_sbbFracBits[2]; - uint32_t m_sigFracBits[2]; - int32_t m_coeffFracBits[6]; - int8_t m_goRicePar; - int8_t m_goRiceZero; - int8_t m_stateId; - uint32_t *m_sigFracBitsArray[12]; - int32_t *m_gtxFracBitsArray[21]; - common_context* m_commonCtx; - - unsigned effWidth; - unsigned effHeight; -} depquant_state; - -typedef struct -{ - int64_t ALIGNED(32) m_rdCost[12]; - uint16_t ALIGNED(32) m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t ALIGNED(16) m_numSigSbb[12]; - int ALIGNED(32) m_remRegBins[12]; - int8_t ALIGNED(16) m_refSbbCtxId[12]; - uint32_t ALIGNED(32) m_sbbFracBits[12][2]; - uint32_t 
ALIGNED(32) m_sigFracBits[12][2]; - int32_t ALIGNED(32) m_coeffFracBits[12][6]; - int8_t ALIGNED(16) m_goRicePar[12]; - int8_t ALIGNED(16) m_goRiceZero[12]; - int8_t ALIGNED(16) m_stateId[12]; - uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; - int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; - common_context* m_commonCtx; - - unsigned effWidth; - unsigned effHeight; - - bool all_gte_four; - bool all_lt_four; -} all_depquant_states; - -typedef struct -{ - common_context m_common_context; - all_depquant_states m_allStates; - int m_curr_state_offset; - int m_prev_state_offset; - int m_skip_state_offset; - depquant_state m_startState; - quant_block* m_quant; - Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; -} context_store; - int uvg_init_nb_info(encoder_control_t * encoder) { memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); @@ -556,326 +458,8 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2] state->m_sbbFracBits[1] = 0; } -static INLINE void checkRdCostSkipSbbZeroOut( - Decision* decision, - const all_depquant_states* const state, - int decision_id, - int skip_offset) { - int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; - decision->rdCost[decision_id] = rdCost; - decision->absLevel[decision_id] = 0; - decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; -} - - -static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) -{ - int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; - - __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); - __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], 
pqDataA->deltaDist[1]); - - __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); - __m256i rd_cost_b = rd_cost_a; - __m256i rd_cost_z = rd_cost_a; - - rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); - rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); - - - if (state->all_gte_four) { - if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); - __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); - __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); - rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); - } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { - __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); - - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); - __m128i t = _mm_slli_epi32(value, 1); - offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - - __m128i max_rice = _mm_set1_epi32(31); - value = _mm_min_epi32(value, max_rice); - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); - go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); - value = _mm_add_epi32(value, go_rice_tab); - - __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); - } else { - const int pqAs[4] = {0, 0, 3, 3}; - ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - const int pqA = pqAs[i]; - const int32_t* goRiceTab = 
g_goRiceBits[state->m_goRicePar[state_offset]]; - if (pqDataA->absLevel[pqA] < 4) { - rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - } - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); - } - - if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); - rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); - } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { - __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); - - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); - __m128i t = _mm_slli_epi32(value, 1); - offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - - __m128i max_rice = _mm_set1_epi32(31); - value = _mm_min_epi32(value, max_rice); - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); - go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); - value = _mm_add_epi32(value, go_rice_tab); - - __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); - } else { - const int pqBs[4] = {2, 2, 1, 1}; - int64_t rd_costs[4] = {0, 0, 0, 0}; - for (int i = 0; i 
< 4; i++) { - const int state_offset = start + i; - const int pqB = pqBs[i]; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - if (pqDataA->absLevel[pqB] < 4) { - rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - } - rd_cost_b = - _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); - } - - if (spt == SCAN_ISCSBB) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); - } else if (spt == SCAN_SOCSBB) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - - original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); - odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i 
m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - - - rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); - rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); - rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); - - rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); - rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); - rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); - } - else { - if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); - } - else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { - rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); - } - - else { - const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; - _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); - _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); - _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - if (state->m_numSigSbb[state_offset]) { - temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; - temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; - 
temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; - } else { - temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; - } - } - rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); - rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); - rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); - } - } - } else if (state->all_lt_four) { - __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); - __m128i max_rice = _mm_set1_epi32(31); - __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); - // RD cost A - { - __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); - __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); - - __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); - - __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); - - __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - - - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - __m128i offsets = _mm_add_epi32(selected, go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); - __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); - - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); - } - // RD cost b - { - __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); - __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); - - __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); - - __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); - - __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - - - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - __m128i offsets = _mm_add_epi32(selected, 
go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); - __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); - - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); - } - // RD cost Z - { - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); - rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); - } - } else { - const int pqAs[4] = {0, 0, 3, 3}; - const int pqBs[4] = {2, 2, 1, 1}; - const int decision_a[4] = {0, 2, 1, 3}; - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - const int pqA = pqAs[i]; - const int pqB = pqBs[i]; - int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; - int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; - int64_t rdCostZ = state->m_rdCost[state_offset]; - if (state->m_remRegBins[state_offset] >= 4) { - if (pqDataA->absLevel[pqA] < 4) { - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - if (pqDataA->absLevel[pqB] < 4) { - rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; - } - if (spt == SCAN_ISCSBB) { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } else if (spt == SCAN_SOCSBB) { - rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; - } else if (state->m_numSigSbb[state_offset]) { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } else { - rdCostZ = decisions->rdCost[decision_a[i]]; - } - } else { - rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; - rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; - rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; - } - temp_rd_cost_a[i] = rdCostA; - temp_rd_cost_b[i] = rdCostB; - temp_rd_cost_z[i] = rdCostZ; - } - rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); - rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); - rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); - } - rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); - rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); - rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); - __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); - - __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); - __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); - __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); - __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); - - __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); - __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); - __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); - - __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); - __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); - __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); - - __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); - __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); - __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); - - __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); - __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, 
final_decision); - __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); - - _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); - final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); - _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); -} - - -static void checkRdCosts( +void uvg_dep_quant_check_rd_costs( const all_depquant_states * const state, const enum ScanPosType spt, const PQData * pqDataA, @@ -950,107 +534,6 @@ static void checkRdCosts( } } -static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) -{ - int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; - if (rdCost < decisions->rdCost[decision_id]) - { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = 0; - decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; - } -} - -static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int - decision_id) -{ - int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; - if (pqData->absLevel[decision_id] < 4) { - rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; - } - else { - const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; - rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] - + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; - } - if (rdCost < decisions->rdCost[decision_id]) { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; - decisions->prevId[decision_id] = -1; - } -} - - -static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) -{ - int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; - coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); - int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; - int index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; -} - - -static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, - .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; - - -static void xDecide( - all_depquant_states* const all_states, - depquant_state* const m_startState, - quant_block * qp, - const enum ScanPosType spt, - const coeff_t absCoeff, - const int lastOffset, - Decision* decisions, - bool zeroOut, - coeff_t quanCoeff, - const int skip_offset, - const int prev_offset) -{ - memcpy(decisions, &startDec, sizeof(Decision)); - - if (zeroOut) { - if (spt == SCAN_EOCSBB) { - 
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); - } - return; - } - - PQData pqData; - preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); - check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); - //checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - //checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); - //checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - //checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); - if (spt == SCAN_EOCSBB) { - checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); - } - - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); -} - static INLINE unsigned templateAbsCompare(coeff_t sum) { @@ -1146,354 +629,9 @@ static INLINE void update_common_context( memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); } -static INLINE void updateStateEOS( - context_store* ctxs, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, - const Decision* decisions, - int decision_id); - -static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, const uint32_t height_in_sbb, - const uint32_t 
next_sbb_right, const uint32_t next_sbb_below, - const Decision* decisions) -{ - all_depquant_states* state = &ctxs->m_allStates; - bool all_above_minus_two = true; - bool all_between_zero_and_three = true; - bool all_above_four = true; - - - int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); - _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); - for (int i = 0; i < 4; ++i) { - all_above_minus_two &= decisions->prevId[i] > -2; - all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; - all_above_four &= decisions->prevId[i] >= 4; - } - if (all_above_minus_two) { - bool all_have_previous_state = true; - __m128i prev_state; - __m128i prev_state_no_offset; - __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); - if (all_above_four) { - prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); - prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); - prev_state = _mm_add_epi32( - prev_state, - prev_state_no_offset - ); - memset(&state->m_numSigSbb[state_offset], 0, 4); - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); - } - } else if (all_between_zero_and_three) { - prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); - prev_state = _mm_add_epi32( - prev_state_no_offset, - _mm_load_si128((const __m128i*)decisions->prevId) - ); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); - __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); - num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); - num_sig_sbb = _mm_add_epi32( - num_sig_sbb, - _mm_min_epi32(abs_level, _mm_set1_epi32(1)) - ); - - num_sig_sbb = 
_mm_shuffle_epi8(num_sig_sbb, control); - int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); - memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); - - int32_t prev_state_scalar[4]; - _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); - } - } else { - int prev_state_s[4] = {-1, -1, -1, -1}; - for (int i = 0; i < 4; ++i) { - const int decision_id = i; - const int curr_state_offset = state_offset + i; - if (decisions->prevId[decision_id] >= 4) { - prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); - state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - } else if (decisions->prevId[decision_id] >= 0) { - prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); - } else { - state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - all_have_previous_state = false; - } - } - prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); - } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } - - // Update common context - __m128i last; - { - const uint32_t numSbb = width_in_sbb * height_in_sbb; - common_context* cc = &ctxs->m_common_context; - size_t setCpSize = 
cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - int previous_state_array[4]; - _mm_storeu_si128((__m128i*)previous_state_array, prev_state); - for (int curr_state = 0; curr_state < 4; ++curr_state) { - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; - const int p_state = previous_state_array[curr_state]; - if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { - const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; - memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); - } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); - } - sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); - } - - __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); - __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); - __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); - - __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); - - __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); - sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); - sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); - __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); - _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); - - memset(&state->m_numSigSbb[state_offset], 0, 4); - memset(&state->m_goRicePar[state_offset], 0, 4); - - uint8_t states[4] = {0, 1, 2, 3}; - memcpy(&state->m_refSbbCtxId[state_offset], states, 4); - if (all_have_previous_state) { - __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); - _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); - } else { - const int temp = (state->effWidth * state->effHeight * 28) / 16; - for (int i = 0; i < 4; ++i) { - if (previous_state_array[i] != -1) { - state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; - } else { - state->m_remRegBins[i + state_offset] = temp; - } - } - } - - const int scanBeg = scan_pos - 16; - const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; - - __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); - __m128i first_byte = _mm_set1_epi32(0xff); - __m128i ones = _mm_set1_epi32(1); - __m128i fours = _mm_set1_epi32(4); - __m256i all[4]; - uint64_t temp[4]; - const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, - 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); - - for (int id = 0; id < 16; id++, nbOut++) { - if (nbOut->num == 0) { - temp[id % 4] = 0; - if (id % 4 == 3) { - all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = 
_mm256_shuffle_epi8(all[id / 4], v_shuffle); - } - continue; - } - __m128i sum_abs = _mm_set1_epi32(0); - __m128i sum_abs_1 = _mm_set1_epi32(0); - __m128i sum_num = _mm_set1_epi32(0); - switch (nbOut->num) { - case 5: - { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); - __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones) - ) - ); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 4: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 3: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 2: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 1: 
{ - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - break; - default: - assert(0); - } - sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); - sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); - __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); - template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); - __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); - temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); - if (id % 4 == 3) { - all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); - last = template_ctx_init; - } - } - - __m256i* v_src_tmp = all; - - __m256i v_tmp[4]; - v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); - v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); - v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); - v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); - - __m256i v_tmp16_lo[2]; - __m256i v_tmp16_hi[2]; - v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); - v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); - - v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], 
_MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); - - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); - } - } - - __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); - __m128i sum_abs1 = _mm_and_si128( - _mm_srli_epi32(last, 3), - _mm_set1_epi32(31)); - - __m128i sum_abs_min = _mm_min_epi32( - _mm_set1_epi32(3), - _mm_srli_epi32( - _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), - 1)); - - __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); - offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); - offsets = _mm_add_epi32(offsets, sum_abs_min); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); - _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); - __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); - __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); - uint32_t sum_gt1_s[4]; - _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); - } - } - else { - for (int i = 0; i < 4; i++) { - updateStateEOS( - ctxs, - scan_pos, - cg_pos, - sigCtxOffsetNext, - gtxCtxOffsetNext, - width_in_sbb, - height_in_sbb, - 
next_sbb_right, - next_sbb_below, - decisions, - i); - } - } -} - - -static INLINE void updateStateEOS( +void uvg_dep_quant_update_state_eos( context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, @@ -1542,542 +680,9 @@ static INLINE void updateStateEOS( state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); } } -static INLINE void updateState( - context_store* ctxs, - int numIPos, - const uint32_t scan_pos, - const Decision* decisions, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag, - int decision_id); - -static INLINE void update_states_avx2( - context_store* ctxs, - int numIPos, - const uint32_t scan_pos, - const Decision* decisions, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag) -{ - all_depquant_states* state = &ctxs->m_allStates; - - bool all_non_negative = true; - bool all_above_minus_two = true; - bool all_minus_one = true; - for (int i = 0; i < 4; ++i) { - all_non_negative &= decisions->prevId[i] >= 0; - all_above_minus_two &= decisions->prevId[i] > -2; - all_minus_one &= decisions->prevId[i] == -1; - } - int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); - _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); - if (all_above_minus_two) { - - bool rem_reg_all_gte_4 = true; - bool rem_reg_all_lt4 = true; - - __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); - if (all_non_negative) { - __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId); - __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); - prv_states = _mm_add_epi32(prv_states, prev_offset); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1); - __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); - - __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); - sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); - __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); - has_coeff = _mm_shuffle_epi8(has_coeff, control); - sig_sbb = _mm_or_si128(sig_sbb, has_coeff); - int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); - memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); - - __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId); - ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); - int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); - memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); - - __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar); - go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - - - __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); - _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); - - __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); - __m128i ones = _mm_set1_epi32(1); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); - - __m128i reg_bins_sub = _mm_set1_epi32(0); - __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); - __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); - - __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); - _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins); - - __m128i mask = 
_mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); - int bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_gte_4 = (bit_mask == 0xFFFF); - mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_lt4 = (bit_mask == 0xFFFF); - - int32_t prv_states_scalar[4]; - _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); - } - } - else if (all_minus_one) { - memset(&state->m_numSigSbb[state_offset], 1, 4); - memset(&state->m_refSbbCtxId[state_offset], -1, 4); - - const int a = (state->effWidth * state->effHeight * 28) / 16; - - __m128i rem_reg_bins = _mm_set1_epi32(a); - __m128i sub = _mm_blendv_epi8( - _mm_set1_epi32(3), - abs_level, - _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) - ); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); - _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); - - __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); - int bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_gte_4 = (bit_mask == 0xFFFF); - mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_lt4 = (bit_mask == 0xFFFF); - - memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); - - } - else { - for (int i = 0; i< 4; ++i) { - const int decision_id = i; - const int state_id = state_offset + i; - if (decisions->prevId[decision_id] >= 0) { - const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; - state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; - state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; - state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; - 
state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; - state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; - if (state->m_remRegBins[state_id] >= 4) { - state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - } - memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); - } else { - state->m_numSigSbb[state_id] = 1; - state->m_refSbbCtxId[state_id] = -1; - int ctxBinSampleRatio = 28; - //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); - } - rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; - rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; - } - } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } - state->all_gte_four = rem_reg_all_gte_4; - state->all_lt_four = rem_reg_all_lt4; - if (rem_reg_all_gte_4) { - const __m128i first_two_bytes = _mm_set1_epi32(0xffff); - const __m128i first_byte = _mm_set1_epi32(0xff); - const __m128i ones = _mm_set1_epi32(1); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); - __m128i tinit = _mm_i32gather_epi32( - (int *)state->m_absLevelsAndCtxInit[state_offset], - 
_mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, first_two_bytes); - __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); - __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); - - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; - switch (numIPos) { - case 5: - { - __m128i t = _mm_i32gather_epi32( - (int *)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 4: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 3: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 2: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = 
_mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } break; - default: - assert(0); - } - __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); - __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); - offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); - __m128i temp = _mm_min_epi32( - _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), - _mm_set1_epi32(3)); - offsets = _mm_add_epi32(offsets, temp); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); - _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); - - sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); - sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); - uint32_t sum_gt1_s[4]; - _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); - } - - __m128i sum_abs = _mm_srli_epi32(tinit, 8); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); - switch (numIPos) { - case 5: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 4: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 
1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 3: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 2: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 1: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } break; - default: - assert(0); - } - sum_abs = _mm_and_si128(sum_abs, first_byte); - if (extRiceFlag) { - assert(0 && "Not implemented for avx2"); - } else { - __m128i sum_all = _mm_max_epi32( - _mm_min_epi32( - _mm_set1_epi32(31), - _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), - _mm_set1_epi32(0)); - __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i go_rice_par = _mm_shuffle_epi8(temp, control); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - } - } - - else if (rem_reg_all_lt4) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; - const __m128i last_two_bytes = _mm_set1_epi32(0xffff); - const __m128i last_byte = _mm_set1_epi32(0xff); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); - __m128i tinit = _mm_i32gather_epi32( - (int*)state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, last_two_bytes); - __m128i sum_abs = _mm_srli_epi32(tinit, 8); - switch 
(numIPos) { - case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } break; - default: - assert(0); - } - if (extRiceFlag) { - assert(0 && "Not implemented for avx2"); - } else { - __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); - __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i go_rice_par = _mm_shuffle_epi8(temp, control); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - - - for (int i = 0; i < 4; ++i) { - state->m_goRiceZero[state_offset + i] = (i < 2 ? 
1 : 2) << state->m_goRicePar[state_offset + i]; - - } - - } - - } - else { - for (int i = 0; i < 4; ++i) { - const int state_id = state_offset + i; - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); - if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; - coeff_t sumAbs1 = (tinit >> 3) & 31; - coeff_t sumNum = tinit & 7; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs1 += MIN(4 + (t & 1), t); \ - sumNum += !!t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs += t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - if (extRiceFlag) { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - state->m_goRicePar[state_id] += currentShift; - } else { - int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - } - } else { - coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs += t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - if (extRiceFlag) { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - state->m_goRicePar[state_id] += currentShift; - } else { - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - } - state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; - } - } - } - } else { - for (int i = 0; i < 4; ++i) { - state->all_gte_four = true; - state->all_lt_four = true; - updateState( - ctxs, - numIPos, - scan_pos, - decisions, - sigCtxOffsetNext, - gtxCtxOffsetNext, - next_nb_info_ssb, - baseLevel, - extRiceFlag, - i); - } - } -} - - -static INLINE void updateState( +void uvg_dep_quant_update_state( context_store * ctxs, int numIPos, const uint32_t scan_pos, @@ -2090,7 +695,7 @@ static INLINE void updateState( int decision_id) { all_depquant_states* state = &ctxs->m_allStates; int state_id = ctxs->m_curr_state_offset + decision_id; - // state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; if (decisions->prevId[decision_id] > -2) { if (decisions->prevId[decision_id] >= 0) { const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; @@ -2200,61 +805,6 @@ static INLINE void updateState( } static bool same[13]; -static void xDecideAndUpdate( - rate_estimator_t* re, - context_store* ctxs, - struct dep_quant_scan_info const* const scan_info, - const coeff_t absCoeff, - const uint32_t scan_pos, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const NbInfoSbb next_nb_info_ssb, - bool zeroOut, - coeff_t quantCoeff, - const uint32_t effWidth, - const uint32_t effHeight, - bool is_chroma) -{ - Decision* decisions = &ctxs->m_trellis[scan_pos]; - SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); - - enum ScanPosType spt = 0; - if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) - { - spt = SCAN_SOCSBB; - } - else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) - { - spt = SCAN_EOCSBB; - } - - xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, 
ctxs->m_prev_state_offset); - - if (scan_pos) { - if (!(scan_pos & 15)) { - SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); - update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); - memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); - memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); - memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); - } else if (!zeroOut) { - update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); - /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, 
false, 1); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3);*/ - } - - if (spt == SCAN_SOCSBB) { - SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); - } - } -} - int uvg_dep_quant( const encoder_state_t* const state, @@ -2419,7 +969,7 @@ int uvg_dep_quant( if (enableScalingLists) { init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); - xDecideAndUpdate( + uvg_dep_quant_decide_and_update( rate_estimator, ctxs, scan_info, @@ -2436,7 +986,7 @@ int uvg_dep_quant( ); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { - xDecideAndUpdate( + uvg_dep_quant_decide_and_update( rate_estimator, ctxs, scan_info, diff --git a/src/dep_quant.h b/src/dep_quant.h index ebb54d31..676d1bab 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -46,6 +46,8 @@ typedef struct encoder_control_t encoder_control_t; +enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; + struct dep_quant_scan_info { uint8_t sig_ctx_offset[2]; @@ -97,6 +99,91 @@ typedef struct uint16_t outPos[5]; } NbInfoOut; +typedef struct { + int32_t absLevel[4]; + int64_t deltaDist[4]; +} PQData; + +typedef struct { + int64_t ALIGNED(32) rdCost[8]; + int32_t ALIGNED(32) absLevel[8]; + int32_t ALIGNED(32) prevId[8]; +} Decision; + + +typedef struct { + uint8_t* sbbFlags; + uint8_t* levels; +} SbbCtx; + +typedef struct { + const NbInfoOut* m_nbInfo; + uint32_t m_sbbFlagBits[2][2]; + SbbCtx m_allSbbCtx[8]; + int m_curr_sbb_ctx_offset; + int m_prev_sbb_ctx_offset; + uint8_t sbb_memory[8 * 1024]; + uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH]; + int num_coeff; +} common_context; + + +typedef struct { + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit + [24]; // 16x8bit for abs 
levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + uint32_t* m_sigFracBitsArray[12]; + int32_t* m_gtxFracBitsArray[21]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} depquant_state; +typedef struct { + int64_t ALIGNED(32) m_rdCost[12]; + uint16_t ALIGNED(32) m_absLevelsAndCtxInit + [12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t ALIGNED(16) m_numSigSbb[12]; + int ALIGNED(32) m_remRegBins[12]; + int8_t ALIGNED(16) m_refSbbCtxId[12]; + uint32_t ALIGNED(32) m_sbbFracBits[12][2]; + uint32_t ALIGNED(32) m_sigFracBits[12][2]; + int32_t ALIGNED(32) m_coeffFracBits[12][6]; + int8_t ALIGNED(16) m_goRicePar[12]; + int8_t ALIGNED(16) m_goRiceZero[12]; + int8_t ALIGNED(16) m_stateId[12]; + uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; + int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; + + bool all_gte_four; + bool all_lt_four; +} all_depquant_states; + +typedef struct { + common_context m_common_context; + all_depquant_states m_allStates; + int m_curr_state_offset; + int m_prev_state_offset; + int m_skip_state_offset; + depquant_state m_startState; + quant_block* m_quant; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; +} context_store; + + int uvg_init_nb_info(encoder_control_t* encoder); void uvg_dealloc_nb_info(encoder_control_t* encoder); @@ -122,4 +209,40 @@ int uvg_dep_quant( enum uvg_tree_type tree_type, int* absSum, const bool enableScalingLists); + + +void uvg_dep_quant_update_state( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + 
int decision_id); + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision* decisions, + int decision_id); + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states* const state, + const enum ScanPosType spt, + const PQData* pqDataA, + Decision* decisions, + const int decisionA, + const int decisionB, + const int state_offset); #endif diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c new file mode 100644 index 00000000..86056de4 --- /dev/null +++ b/src/strategies/avx2/depquant-avx2.c @@ -0,0 +1,1389 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/*
+* \file
+*/
+
+#include "strategies/avx2/depquant-avx2.h"
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+#include "dep_quant.h"
+
+#include <immintrin.h>
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "transform.h"
+#include "generic/quant-generic.h"
+#include "uvg_math.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = {
+  { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+  { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+  { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680,
327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + + rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + + if (state->all_gte_four) { + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) 
>> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); + } + + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqBs[4] = {2, 2, 1, 1}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqB = pqBs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqB] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rd_costs[i] += 
state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_b = + _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = 
_mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } + else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); + } + + else { + const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; + _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); + _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); + _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + } + } else if 
(state->all_lt_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(31); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); + // RD cost A + { + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost b + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + __m128i go_rice_offset = 
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); + } + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); + rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); + rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); + __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); + + __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); + __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); + __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); + __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); + + __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); + __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); + __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); + + __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); + __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); + __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); + + __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); + + __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, 
final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); + + _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); + final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); +} + + +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) +{ + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +} + + +static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) +{ + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) +{ + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block * qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + 
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, const uint32_t height_in_sbb, + const uint32_t next_sbb_right, const uint32_t next_sbb_below, + const Decision* decisions) +{ + all_depquant_states* state = &ctxs->m_allStates; + bool all_above_minus_two = true; + bool all_between_zero_and_three = true; + bool all_above_four = true; + + + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + for (int i = 0; i < 4; ++i) { + all_above_minus_two &= 
decisions->prevId[i] > -2; + all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; + all_above_four &= decisions->prevId[i] >= 4; + } + if (all_above_minus_two) { + bool all_have_previous_state = true; + __m128i prev_state; + __m128i prev_state_no_offset; + __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + if (all_above_four) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); + prev_state = _mm_add_epi32( + prev_state, + prev_state_no_offset + ); + memset(&state->m_numSigSbb[state_offset], 0, 4); + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); + } + } else if (all_between_zero_and_three) { + prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prev_state = _mm_add_epi32( + prev_state_no_offset, + _mm_load_si128((const __m128i*)decisions->prevId) + ); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); + num_sig_sbb = _mm_add_epi32( + num_sig_sbb, + _mm_min_epi32(abs_level, _mm_set1_epi32(1)) + ); + + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); + int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); + + int32_t prev_state_scalar[4]; + _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + } + } else { + int prev_state_s[4] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) { + const int 
decision_id = i; + const int curr_state_offset = state_offset + i; + if (decisions->prevId[decision_id] >= 4) { + prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + } else if (decisions->prevId[decision_id] >= 0) { + prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; + memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); + } else { + state->m_numSigSbb[curr_state_offset] = 1; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + all_have_previous_state = false; + } + } + prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); + uint32_t max_abs_s[4]; + _mm_storeu_si128((__m128i*)max_abs_s, max_abs); + for (int i = 0; i < 4; ++i) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + levels[level_offset] = max_abs_s[i]; + } + + // Update common context + __m128i last; + { + const uint32_t numSbb = width_in_sbb * height_in_sbb; + common_context* cc = &ctxs->m_common_context; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + int previous_state_array[4]; + _mm_storeu_si128((__m128i*)previous_state_array, prev_state); + for (int curr_state = 0; curr_state < 4; ++curr_state) { + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = 
cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; + memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); + memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); + } else { + memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); + memset(levels + scan_pos, 0, setCpSize); + } + sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); + } + + __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); + __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); + __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); + __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); + + __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); + __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + + __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); + sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); + sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); + _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); + + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(&state->m_goRicePar[state_offset], 0, 4); + + uint8_t states[4] = {0, 1, 2, 3}; + memcpy(&state->m_refSbbCtxId[state_offset], states, 4); + if (all_have_previous_state) { + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); + } else { + const int temp = (state->effWidth * state->effHeight * 28) / 16; + for (int i = 0; i < 4; ++i) { + if (previous_state_array[i] != -1) { + state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; + } else { + state->m_remRegBins[i + state_offset] = temp; + } + } + } + + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; + + __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); + __m128i first_byte = _mm_set1_epi32(0xff); + __m128i ones = _mm_set1_epi32(1); + __m128i fours = _mm_set1_epi32(4); + __m256i all[4]; + uint64_t temp[4]; + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num == 0) { + temp[id % 4] = 0; + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + all[id / 4] = 
_mm256_shuffle_epi8(all[id / 4], v_shuffle); + } + continue; + } + __m128i sum_abs = _mm_set1_epi32(0); + __m128i sum_abs_1 = _mm_set1_epi32(0); + __m128i sum_num = _mm_set1_epi32(0); + switch (nbOut->num) { + case 5: + { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); + __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones) + ) + ); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 4: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 3: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 2: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 1: 
{ + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + break; + default: + assert(0); + } + sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); + sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); + __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); + template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); + __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); + temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); + last = template_ctx_init; + } + } + + __m256i* v_src_tmp = all; + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], 
_MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); + } + } + + __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_si128( + _mm_srli_epi32(last, 3), + _mm_set1_epi32(31)); + + __m128i sum_abs_min = _mm_min_epi32( + _mm_set1_epi32(3), + _mm_srli_epi32( + _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), + 1)); + + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + offsets = _mm_add_epi32(offsets, sum_abs_min); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); + } + } + else { + for (int i = 0; i < 4; i++) { + uvg_dep_quant_update_state_eos( + ctxs, + scan_pos, + cg_pos, + sigCtxOffsetNext, + gtxCtxOffsetNext, + width_in_sbb, + 
height_in_sbb, + next_sbb_right, + next_sbb_below, + decisions, + i); + } + } +} + +static INLINE void update_states_avx2( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag) +{ + all_depquant_states* state = &ctxs->m_allStates; + + bool all_non_negative = true; + bool all_above_minus_two = true; + bool all_minus_one = true; + for (int i = 0; i < 4; ++i) { + all_non_negative &= decisions->prevId[i] >= 0; + all_above_minus_two &= decisions->prevId[i] > -2; + all_minus_one &= decisions->prevId[i] == -1; + } + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + if (all_above_minus_two) { + + bool rem_reg_all_gte_4 = true; + bool rem_reg_all_lt4 = true; + + __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); + if (all_non_negative) { + __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId); + __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prv_states = _mm_add_epi32(prv_states, prev_offset); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); + + __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); + sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); + __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); + has_coeff = _mm_shuffle_epi8(has_coeff, control); + sig_sbb = _mm_or_si128(sig_sbb, has_coeff); + int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); + + __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId); + ref_sbb_ctx_idx = 
_mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); + int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); + memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); + + __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar); + go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); + _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); + + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); + __m128i ones = _mm_set1_epi32(1); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); + + __m128i reg_bins_sub = _mm_set1_epi32(0); + __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); + __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); + + __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); + _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins); + + __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); + int bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_gte_4 = (bit_mask == 0xFFFF); + mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_lt4 = (bit_mask == 0xFFFF); + + int32_t prv_states_scalar[4]; + _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); + } + } + else if (all_minus_one) { + 
memset(&state->m_numSigSbb[state_offset], 1, 4); + memset(&state->m_refSbbCtxId[state_offset], -1, 4); + + const int a = (state->effWidth * state->effHeight * 28) / 16; + + __m128i rem_reg_bins = _mm_set1_epi32(a); + __m128i sub = _mm_blendv_epi8( + _mm_set1_epi32(3), + abs_level, + _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) + ); + rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); + + __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); + int bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_gte_4 = (bit_mask == 0xFFFF); + mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); + bit_mask = _mm_movemask_epi8(mask); + rem_reg_all_lt4 = (bit_mask == 0xFFFF); + + memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); + + } + else { + for (int i = 0; i< 4; ++i) { + const int decision_id = i; + const int state_id = state_offset + i; + if (decisions->prevId[decision_id] >= 0) { + const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 4) { + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); + } + memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); + } else { + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; + int ctxBinSampleRatio = 28; + //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? 
MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); + memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); + } + rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; + rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; + } + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); + uint32_t max_abs_s[4]; + _mm_storeu_si128((__m128i*)max_abs_s, max_abs); + for (int i = 0; i < 4; ++i) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + levels[level_offset] = max_abs_s[i]; + } + state->all_gte_four = rem_reg_all_gte_4; + state->all_lt_four = rem_reg_all_lt4; + if (rem_reg_all_gte_4) { + const __m128i first_two_bytes = _mm_set1_epi32(0xffff); + const __m128i first_byte = _mm_set1_epi32(0xff); + const __m128i ones = _mm_set1_epi32(1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; + const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); + __m128i tinit = _mm_i32gather_epi32( + (int *)state->m_absLevelsAndCtxInit[state_offset], + _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + 2); + tinit = _mm_and_si128(tinit, first_two_bytes); + __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); + __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); + + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; + switch (numIPos) { + case 5: + { + __m128i t = _mm_i32gather_epi32( + (int *)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + t = _mm_and_si128(t, first_byte); + __m128i min_arg = _mm_min_epi32( + 
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + } + case 4: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + t = _mm_and_si128(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + } + case 3: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + t = _mm_and_si128(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + } + case 2: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + t = _mm_and_si128(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + } + case 1: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), + 1); + t = _mm_and_si128(t, first_byte); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + } break; 
+ default: + assert(0); + } + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + __m128i temp = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), + _mm_set1_epi32(3)); + offsets = _mm_add_epi32(offsets, temp); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + } + + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); + switch (numIPos) { + case 5: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 
+ 1); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; + default: + assert(0); + } + sum_abs = _mm_and_si128(sum_abs, first_byte); + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_max_epi32( + _mm_min_epi32( + _mm_set1_epi32(31), + _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), + _mm_set1_epi32(0)); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + } + } + + else if (rem_reg_all_lt4) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; + const __m128i last_two_bytes = _mm_set1_epi32(0xffff); + const __m128i last_byte = _mm_set1_epi32(0xff); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; + const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); + const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); + __m128i tinit = _mm_i32gather_epi32( + (int*)state->m_absLevelsAndCtxInit[state_offset], + _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + 2); + tinit = _mm_and_si128(tinit, last_two_bytes); + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + switch (numIPos) { + case 5: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), + 1); + t = _mm_and_si128(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 4: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), + 1); + t = _mm_and_si128(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 3: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, 
_mm_set1_epi32(next_nb_info_ssb.inPos[2])), + 1); + t = _mm_and_si128(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 2: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), + 1); + t = _mm_and_si128(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } + case 1: { + __m128i t = _mm_i32gather_epi32( + (int*)levels, + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), + 1); + t = _mm_and_si128(t, last_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + + for (int i = 0; i < 4; ++i) { + state->m_goRiceZero[state_offset + i] = (i < 2 ? 
1 : 2) << state->m_goRicePar[state_offset + i]; + + } + + } + + } + else { + for (int i = 0; i < 4; ++i) { + const int state_id = state_offset + i; + uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs1 += MIN(4 + (t & 1), t); \ + sumNum += !!t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } else { + coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; + } + } + } + } else { + for (int i = 0; i < 4; ++i) { + state->all_gte_four = true; + state->all_lt_four = true; + uvg_dep_quant_update_state( + ctxs, + numIPos, + scan_pos, + decisions, + sigCtxOffsetNext, + gtxCtxOffsetNext, + next_nb_info_ssb, + baseLevel, + extRiceFlag, + i); + } + } +} + +void uvg_dep_quant_decide_and_update_avx2( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + 
update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +#endif //COMPILE_INTEL_AVX2 && defined X86_64 + +int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth) +{ + bool success = true; + +#if COMPILE_INTEL_AVX2 && defined X86_64 + success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2); +#endif //COMPILE_INTEL_AVX2 && defined X86_64 + + return success; +} diff --git a/src/strategies/avx2/depquant-avx2.h b/src/strategies/avx2/depquant-avx2.h new file mode 100644 index 00000000..e6db110c --- /dev/null +++ b/src/strategies/avx2/depquant-avx2.h @@ -0,0 +1,46 @@ +#ifndef STRATEGIES_DEPQUANT_AVX2_H_ +#define STRATEGIES_DEPQUANT_AVX2_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for AVX2. + */ + +#include "global.h" // IWYU pragma: keep + + +int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_AVX2_H_ diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c new file mode 100644 index 00000000..aa2ea99e --- /dev/null +++ b/src/strategies/generic/depquant-generic.c @@ -0,0 +1,238 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "strategies/generic/depquant-generic.h" + +#include "dep_quant.h" + +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "strategyselector.h" +#include "transform.h" +#include "uvg_math.h" +#include "generic/quant-generic.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + {32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, + 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, + 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, + 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + {65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, + 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, + 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, + 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + {98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, + 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, + 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, + 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, + 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, + 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, + 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + + +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) { + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +} + +static INLINE 
void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) +{ + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) +{ + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block* qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + 
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void uvg_dep_quant_decide_and_update_generic( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, 
&ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 0); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 1); + uvg_dep_quant_update_state(ctxs, 
next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 2); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 3); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic); + + + return success; +} diff --git a/src/strategies/generic/depquant-generic.h b/src/strategies/generic/depquant-generic.h new file mode 100644 index 00000000..488963be --- /dev/null +++ b/src/strategies/generic/depquant-generic.h @@ -0,0 +1,50 @@ +#ifndef STRATEGIES_DEPQUANT_GENERIC_H_ +#define STRATEGIES_DEPQUANT_GENERIC_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "tables.h" + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_GENERIC_H_ diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c new file mode 100644 index 00000000..7ba62163 --- /dev/null +++ b/src/strategies/strategies-depquant.c @@ -0,0 +1,54 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "strategies/strategies-depquant.h" + +#include "strategies/avx2/depquant-avx2.h" +#include "strategies/generic/depquant-generic.h" +#include "strategyselector.h" + + +// Define function pointers. 
+dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; + + +int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategy_register_depquant_generic(opaque, bitdepth); + + if (uvg_g_hardware_flags.intel_flags.avx2) { + success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth); + } + return success; +} diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h new file mode 100644 index 00000000..4021c458 --- /dev/null +++ b/src/strategies/strategies-depquant.h @@ -0,0 +1,77 @@ +#ifndef STRATEGIES_DEPQUANT_H_ +#define STRATEGIES_DEPQUANT_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for depquant functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "uvg266.h"
+#include "dep_quant.h"
+
+
+// Declare function pointers.
+typedef void(dep_quant_decide_and_update_func)(
+  rate_estimator_t* re,
+  context_store* ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t absCoeff,
+  const uint32_t scan_pos,
+  const uint32_t width_in_sbb,
+  const uint32_t height_in_sbb,
+  const NbInfoSbb next_nb_info_ssb,
+  bool zeroOut,
+  coeff_t quantCoeff,
+  const uint32_t effWidth,
+  const uint32_t effHeight,
+  bool is_chroma);
+
+
+
+// Declare function pointers.
+extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
+
+int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth);
+
+
+#define STRATEGIES_DEPQUANT_EXPORTS \
+  {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
+
+
+
+#endif //STRATEGIES_DEPQUANT_H_
diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c
index 89baf86e..62c75d6f 100644
--- a/src/strategies/strategies-quant.c
+++ b/src/strategies/strategies-quant.c
@@ -38,15 +38,16 @@
 // Define function pointers.
-quant_func *uvg_quant; -quant_cbcr_func *uvg_quant_cbcr_residual; -quant_residual_func *uvg_quantize_residual; -dequant_func *uvg_dequant; -coeff_abs_sum_func *uvg_coeff_abs_sum; +quant_func *uvg_quant; +quant_cbcr_func *uvg_quant_cbcr_residual; +quant_residual_func *uvg_quantize_residual; +dequant_func *uvg_dequant; +coeff_abs_sum_func *uvg_coeff_abs_sum; fast_coeff_cost_func *uvg_fast_coeff_cost; -int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) { +int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth) +{ bool success = true; success &= uvg_strategy_register_quant_generic(opaque, bitdepth); diff --git a/src/strategyselector.c b/src/strategyselector.c index 477604a9..d6dffa4e 100644 --- a/src/strategyselector.c +++ b/src/strategyselector.c @@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { fprintf(stderr, "uvg_strategy_register_encode failed!\n"); return 0; } + if (!uvg_strategy_register_depquant(&strategies, bitdepth)) { + fprintf(stderr, "uvg_strategy_register_depquant failed!\n"); + return 0; + } while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); diff --git a/src/strategyselector.h b/src/strategyselector.h index caadfda9..8bbdfbed 100644 --- a/src/strategyselector.h +++ b/src/strategyselector.h @@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st #include "strategies/strategies-intra.h" #include "strategies/strategies-sao.h" #include "strategies/strategies-encode.h" +#include "strategies/strategies-depquant.h" #include "strategies/strategies-alf.h" static const strategy_to_select_t strategies_to_select[] = { @@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_SAO_EXPORTS STRATEGIES_ENCODE_EXPORTS STRATEGIES_ALF_EXPORTS + STRATEGIES_DEPQUANT_EXPORTS { NULL, NULL }, }; From 48ea4bff4d538880dc712297ecf4210e88504548 Mon 
Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 21 Apr 2023 11:34:31 +0300 Subject: [PATCH 228/254] [dep_quant] Fix rate_estimator and quant_block init cases --- src/dep_quant.c | 2 ++ src/search.c | 1 + src/search_intra.c | 1 + src/transform.c | 4 +--- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 519e5795..cc107ddf 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -908,6 +908,8 @@ int uvg_dep_quant( init_rate_esimator(rate_estimator, &state->search_cabac, compID); xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); rate_estimator->needs_init = false; + } else if (compID == COLOR_U && state->encoder_control->cfg.jccr) { + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); } reset_common_context(&dep_quant_context.m_common_context, rate_estimator, (width * height) >> 4, numCoeff); diff --git a/src/search.c b/src/search.c index 755062ab..c353a914 100644 --- a/src/search.c +++ b/src/search.c @@ -1414,6 +1414,7 @@ static double search_cu( else { intra_search.pred_cu.intra.mode_chroma = 0; } + state->quant_blocks[2].needs_init = true; uvg_intra_recon_cu(state, &intra_search, chroma_loc, &intra_search.pred_cu, lcu, diff --git a/src/search_intra.c b/src/search_intra.c index 9d4c5da6..a644ed9c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1971,6 +1971,7 @@ void uvg_search_cu_intra( } state->quant_blocks[0].needs_init = 1; + state->rate_estimator[0].needs_init = 1; search_intra_rdo( state, number_of_modes_to_search, diff --git a/src/transform.c b/src/transform.c index 969394df..45846cf9 100644 --- a/src/transform.c +++ b/src/transform.c @@ -437,7 +437,7 @@ static void quantize_chroma( int8_t height = cu_loc->chroma_height; if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { int abs_sum = 0; - state->quant_blocks[1].needs_init = state->encoder_control->cfg.jccr; + state->quant_blocks[2].needs_init = state->encoder_control->cfg.jccr; 
uvg_dep_quant( state, cur_tu, @@ -1561,8 +1561,6 @@ void uvg_quantize_lcu_residual( uvg_cu_loc_ctor(&loc, x, y, width, height); if (luma) { - state->quant_blocks[0].needs_init = true; - state->rate_estimator[0].needs_init = true; quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } double c_lambda = state->c_lambda; From 2811ce58f4d8be230f913be7776b8c5e9098b382 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Sat, 22 Apr 2023 11:53:54 +0300 Subject: [PATCH 229/254] [avx2] AVX2 version of depquant now exactly matches scalar version --- src/dep_quant.c | 2 +- src/strategies/avx2/depquant-avx2.c | 20 ++++++++++++-------- src/transform.c | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index cc107ddf..c6c6aee9 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -664,7 +664,7 @@ void uvg_dep_quant_update_state_eos( memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); - *temp = (uint8_t)MIN(32, decisions->absLevel[decision_id]); + *temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 86056de4..5692f488 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -35,6 +35,7 @@ */ #include "strategies/avx2/depquant-avx2.h" +#include "strategyselector.h" #if COMPILE_INTEL_AVX2 && defined X86_64 #include "dep_quant.h" @@ -352,13 +353,13 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, 
a_vs_b); - __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); - __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); - __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); + __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_decision, rd_cost_z); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_decision, rd_cost_z, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(decision_data, z_data, z_vs_decision); - __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); - __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, final_decision); - __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); + __m256i final_decision = _mm256_cmpgt_epi64(cheaper_second, cheaper_first); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_second, cheaper_first, final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_second_data, cheaper_first_data, final_decision); _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); @@ -952,7 +953,7 @@ static INLINE void update_states_avx2( } } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); uint32_t max_abs_s[4]; _mm_storeu_si128((__m128i*)max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { @@ -1094,7 +1095,7 @@ static INLINE void update_states_avx2( } __m128i sum_abs = _mm_srli_epi32(tinit, 8); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); switch (numIPos) { case 5: { @@ -1103,6 +1104,9 @@ static INLINE void update_states_avx2( _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); sum_abs = _mm_add_epi32(t, sum_abs); + // Need 
this to make sure we don't go beyond 255 + sum_abs = _mm_and_si128(sum_abs, first_byte); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); } case 4: { diff --git a/src/transform.c b/src/transform.c index 45846cf9..77834072 100644 --- a/src/transform.c +++ b/src/transform.c @@ -437,7 +437,7 @@ static void quantize_chroma( int8_t height = cu_loc->chroma_height; if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { int abs_sum = 0; - state->quant_blocks[2].needs_init = state->encoder_control->cfg.jccr; + state->quant_blocks[2].needs_init |= state->encoder_control->cfg.jccr; uvg_dep_quant( state, cur_tu, From b4c84e820cccfc7452d1e27448bcad5dd71cd244 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Apr 2023 10:34:41 +0300 Subject: [PATCH 230/254] [avx2] Simplify --- src/dep_quant.c | 2 +- src/strategies/avx2/depquant-avx2.c | 86 +++++++++++------------------ 2 files changed, 34 insertions(+), 54 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index c6c6aee9..99a15df3 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -656,7 +656,7 @@ void uvg_dep_quant_update_state_eos( } else if (decisions->prevId[decision_id] >= 0) { prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] + !!decisions->absLevel[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prvState], 16 * sizeof(uint8_t)); } else { diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 5692f488..0e5f35d1 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -158,29 +158,20 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en if (spt == SCAN_ISCSBB) { __m256i original = _mm256_loadu_si256((__m256i 
const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); } else if (spt == SCAN_SOCSBB) { __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + + __m256i m_sigFracBits_0 = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i m_sigFracBits_1 = _mm256_srli_epi64(original, 32); original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); - odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + __m256i m_sbbFracBits_1 = _mm256_srli_epi64(original, 32); - rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); rd_cost_z 
= _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); @@ -190,19 +181,17 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); } else { - if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { + int num_sig_sbb; + memcpy(&num_sig_sbb, &state->m_numSigSbb[start], 4); + if (num_sig_sbb == 0x01010101) { __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); } - else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { + else if (num_sig_sbb == 0) { rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); } @@ -527,7 +516,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = 
_mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); - num_sig_sbb = _mm_add_epi32( + num_sig_sbb = _mm_or_si128( num_sig_sbb, _mm_min_epi32(abs_level, _mm_set1_epi32(1)) ); @@ -552,7 +541,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); } else { state->m_numSigSbb[curr_state_offset] = 1; @@ -591,7 +580,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); memset(levels + scan_pos, 0, setCpSize); } - sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + sbbFlags[cg_pos] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); } @@ -996,7 +985,7 @@ static INLINE void update_states_avx2( ); sum_num = _mm_add_epi32( sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + _mm_min_epi32(t, ones)); } case 4: { @@ -1013,9 +1002,7 @@ static INLINE void update_states_avx2( sum_abs1, min_arg ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } case 3: { @@ -1032,9 +1019,7 @@ static INLINE void update_states_avx2( sum_abs1, min_arg ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + sum_num = 
_mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } case 2: { @@ -1051,9 +1036,7 @@ static INLINE void update_states_avx2( sum_abs1, min_arg ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } case 1: { __m128i t = _mm_i32gather_epi32( @@ -1069,9 +1052,7 @@ static INLINE void update_states_avx2( sum_abs1, min_arg ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } break; default: assert(0); @@ -1161,6 +1142,7 @@ static INLINE void update_states_avx2( } else if (rem_reg_all_lt4) { + const __m128i first_byte = _mm_set1_epi32(0xff); uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; const __m128i last_two_bytes = _mm_set1_epi32(0xffff); const __m128i last_byte = _mm_set1_epi32(0xff); @@ -1173,21 +1155,23 @@ static INLINE void update_states_avx2( 2); tinit = _mm_and_si128(tinit, last_two_bytes); __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); switch (numIPos) { case 5: { __m128i t = _mm_i32gather_epi32( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), 1); - t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); + // Need this to make sure we don't go beyond 255 + sum_abs = _mm_and_si128(sum_abs, first_byte); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); } case 4: { __m128i t = _mm_i32gather_epi32( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), 1); - t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 3: { @@ -1195,7 +1179,6 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), 1); - t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 2: { @@ -1203,7 
+1186,6 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), 1); - t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } case 1: { @@ -1211,12 +1193,12 @@ static INLINE void update_states_avx2( (int*)levels, _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), 1); - t = _mm_and_si128(t, last_byte); sum_abs = _mm_add_epi32(sum_abs, t); } break; default: assert(0); } + sum_abs = _mm_and_si128(sum_abs, last_byte); if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { @@ -1229,10 +1211,8 @@ static INLINE void update_states_avx2( for (int i = 0; i < 4; ++i) { - state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i]; - + state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i]; } - } } From cf6f03b73b4fa09c3ce5f84a98895d3fccd6e719 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Apr 2023 14:41:04 +0300 Subject: [PATCH 231/254] [avx2] This has worked but I'm pretty sure these should be unaligned --- src/strategies/avx2/depquant-avx2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 0e5f35d1..601f04da 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -749,10 +749,10 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - 
_mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); for (int i = 0; i < 4; ++i) { memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); From dda972c665edc9649590159e570d6c0adc41e0e8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Apr 2023 15:55:55 +0300 Subject: [PATCH 232/254] [avx2] Try to do lnz decision with avx2 --- CMakeLists.txt | 2 +- src/dep_quant.c | 66 +++++++++++++++++--- src/strategies/avx2/depquant-avx2.c | 73 +++++++++++++++++++++++ src/strategies/generic/depquant-generic.c | 18 +++++- src/strategies/strategies-depquant.c | 1 + src/strategies/strategies-depquant.h | 11 ++++ 6 files changed, 160 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8c37bbc..cafb8fd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,7 @@ target_include_directories(uvg266 PUBLIC src) target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) -file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c" "src/dep_quant.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 
RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index 99a15df3..87799e35 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -32,6 +32,8 @@ #include "dep_quant.h" +#include + #include "cu.h" #include "encoderstate.h" #include "intra.h" @@ -804,7 +806,6 @@ void uvg_dep_quant_update_state( } } -static bool same[13]; int uvg_dep_quant( const encoder_state_t* const state, @@ -889,14 +890,63 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } - const int32_t default_quant_coeff = dep_quant_context.m_quant->m_QScale; - const int32_t thres = dep_quant_context.m_quant->m_thresLast; - for (; firstTestPos >= 0; firstTestPos--) { - coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); - if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { - break; + //uvg_find_first_non_zero_coeff(srcCoeff, enableScalingLists, dep_quant_context, scan, q_coeff, &firstTestPos, width, height); + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i 
const*)&srcCoeff[i + 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 4, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 8, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 12, &srcCoeff[i], 8); + + sbb_data = _mm256_loadu_si256((__m256i const*)temp_d); + } + sbb_data = _mm256_abs_epi16(sbb_data); + + __m256i a = _mm256_cmpgt_epi16(sbb_data, th); + if (!_mm256_testz_si256(a, a)) { + if (temp >= 0) { + temp += 15; + } + break; + } + } + } + for (; temp >= 0; temp--) { + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } } } + + firstTestPos = temp; if (firstTestPos < 0) { return 0; } @@ -961,7 +1011,7 @@ int uvg_dep_quant( const uint32_t height_in_sbb = MAX(height >> 2, 1); const uint32_t width_in_sbb = MAX(width >> 2, 1); - + //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { uint32_t blkpos = scan[scanIdx]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 601f04da..51f4e7d8 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -37,6 +37,8 @@ #include "strategies/avx2/depquant-avx2.h" #include "strategyselector.h" +#define COMPILE_INTEL_AVX2 1 + #if COMPILE_INTEL_AVX2 && defined X86_64 #include "dep_quant.h" @@ -1359,6 +1361,76 @@ void uvg_dep_quant_decide_and_update_avx2( } +void uvg_find_first_non_zero_avx2( + const coeff_t* srcCoeff, + const bool enableScalingLists, + context_store dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + const int width, + const int height) +{ + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = *firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + 
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i+ 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 4, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 8, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 12, &srcCoeff[i], 8); + + sbb_data = _mm256_loadu_si256((__m256i const*)temp_d); + } + sbb_data = _mm256_abs_epi16(sbb_data); + + __m256i a = _mm256_cmpgt_epi16(sbb_data, th); + if (!_mm256_testz_si256(a, a)) + { + if (temp >= 0) { + temp += 15; + } + break; + } + } + } + for (;temp >= 0; temp--) { + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } + + *firstTestPos = temp; +} + + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth) @@ -1367,6 +1439,7 @@ int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2); + success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "avx2", 40, &uvg_find_first_non_zero_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c index aa2ea99e..f1103054 100644 --- 
a/src/strategies/generic/depquant-generic.c +++ b/src/strategies/generic/depquant-generic.c @@ -227,12 +227,26 @@ static void uvg_dep_quant_decide_and_update_generic( } +void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, context_store dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height) +{ + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = *firstTestPos; + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff)); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + *firstTestPos = temp; +} + int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth) { bool success = true; - success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic); - + success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic); + success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic); return success; } diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c index 7ba62163..d0eac087 100644 --- a/src/strategies/strategies-depquant.c +++ b/src/strategies/strategies-depquant.c @@ -39,6 +39,7 @@ // Define function pointers. 
dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h index 4021c458..6a49dc35 100644 --- a/src/strategies/strategies-depquant.h +++ b/src/strategies/strategies-depquant.h @@ -61,16 +61,27 @@ typedef int(dep_quant_decide_and_update_func)( const uint32_t effHeight, bool is_chroma); +typedef void(find_first_non_zero_coeff_func)( + const coeff_t* srcCoeff, + const bool enableScalingLists, + context_store dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + int width, + int height); // Declare function pointers. extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); #define STRATEGIES_DEPQUANT_EXPORTS \ {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \ + {"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \ From a624988c912533188abbcc05dc10f8055be17a9a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 5 May 2023 14:15:05 +0300 Subject: [PATCH 233/254] [dep_quant] Separate abs levels and ctx init --- src/dep_quant.c | 30 +++++++------ src/dep_quant.h | 7 ++- src/strategies/avx2/depquant-avx2.c | 68 +++++++++++++++-------------- 3 files changed, 54 insertions(+), 51 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 87799e35..b970d510 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -583,7 +583,7 @@ static INLINE void update_common_context( memset(levels + scan_pos, 0, setCpSize); } sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 16 * sizeof(uint8_t)); + memcpy(levels + 
scan_pos, ctxs->m_allStates.m_absLevels[curr_state], 16 * sizeof(uint8_t)); const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0); ctxs->m_allStates.m_numSigSbb[curr_state] = 0; @@ -600,7 +600,7 @@ static INLINE void update_common_context( ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; - uint16_t *templateCtxInit = ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state] + 8; + uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[curr_state]; const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; const uint8_t* absLevels = levels + scanBeg; @@ -628,7 +628,7 @@ static INLINE void update_common_context( templateCtxInit[id] = 0; } } - memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); + memset(ctxs->m_allStates.m_absLevels[curr_state], 0, 16 * sizeof(uint8_t)); } @@ -654,24 +654,24 @@ void uvg_dep_quant_update_state_eos( if (decisions->prevId[decision_id] >= 4) { prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prvState], 16 * sizeof(uint8_t)); + memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + 
memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); } - uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[curr_state_offset][scan_pos & 15]); + uint8_t* temp = &state->m_absLevels[curr_state_offset][scan_pos & 15]; *temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); - coeff_t tinit = state->m_absLevelsAndCtxInit[curr_state_offset][8 + ((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_ctxInit[curr_state_offset][((scan_pos - 1) & 15)]; coeff_t sumNum = tinit & 7; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumGt1 = sumAbs1 - sumNum; @@ -712,7 +712,8 @@ void uvg_dep_quant_update_state( ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); + memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); + memcpy(state->m_ctxInit[state_id], state->m_ctxInit[prvState], 16 * sizeof(uint16_t)); } else { state->m_numSigSbb[state_id] = 1; @@ -721,15 +722,16 @@ void uvg_dep_quant_update_state( //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); + memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); + memset(state->m_ctxInit[state_id], 0, 16 * sizeof(uint16_t)); } state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4; - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); + uint8_t* levels = state->m_absLevels[state_id]; levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_ctxInit[state_id][((scan_pos - 1) & 15)]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } @@ -751,7 +753,7 @@ void uvg_dep_quant_update_state( sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; + coeff_t sumAbs = state->m_ctxInit[state_id][(scan_pos - 1) & 15] >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); @@ -775,7 +777,7 @@ void uvg_dep_quant_update_state( } } else { - coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; + coeff_t sumAbs = (state->m_ctxInit[state_id][(scan_pos - 1) & 15]) >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); diff --git a/src/dep_quant.h b/src/dep_quant.h index 676d1bab..45220706 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -130,8 +130,7 @@ typedef struct { typedef struct { int64_t m_rdCost; - uint16_t m_absLevelsAndCtxInit - [24]; // 16x8bit for abs levels + 16x16bit for ctx init id + uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs 
levels + 16x16bit for ctx init id int8_t m_numSigSbb; int m_remRegBins; int8_t m_refSbbCtxId; @@ -150,8 +149,8 @@ typedef struct { } depquant_state; typedef struct { int64_t ALIGNED(32) m_rdCost[12]; - uint16_t ALIGNED(32) m_absLevelsAndCtxInit - [12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id + uint8_t ALIGNED(32) m_absLevels[12][16]; + uint16_t ALIGNED(32) m_ctxInit[12][16]; int8_t ALIGNED(16) m_numSigSbb[12]; int ALIGNED(32) m_remRegBins[12]; int8_t ALIGNED(16) m_refSbbCtxId[12]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 51f4e7d8..fb53713d 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -506,7 +506,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, ); memset(&state->m_numSigSbb[state_offset], 0, 4); for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); + memset(state->m_absLevels[state_offset + i], 0, 16 * sizeof(uint8_t)); } } else if (all_between_zero_and_three) { prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); @@ -530,7 +530,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int32_t prev_state_scalar[4]; _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prev_state_scalar[i]], 16 * sizeof(uint8_t)); } } else { int prev_state_s[4] = {-1, -1, -1, -1}; @@ -540,14 +540,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (decisions->prevId[decision_id] >= 4) { prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 
16 * sizeof(uint8_t)); + memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); + memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prev_state_s[i]], 16 * sizeof(uint8_t)); } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); all_have_previous_state = false; } } @@ -558,7 +558,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, uint32_t max_abs_s[4]; _mm_storeu_si128((__m128i*)max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; levels[level_offset] = max_abs_s[i]; } @@ -583,7 +583,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memset(levels + scan_pos, 0, setCpSize); } sbbFlags[cg_pos] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); + memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state + state_offset], 16 * sizeof(uint8_t)); } __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); @@ -751,13 +751,13 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - 
_mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset]), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 1]), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 2]), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 3]), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); + memset(state->m_absLevels[state_offset + i], 0, 16); } } @@ -887,7 +887,8 @@ static INLINE void update_states_avx2( int32_t prv_states_scalar[4]; _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); + memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t)); + memcpy(state->m_ctxInit[state_offset + i], state->m_ctxInit[prv_states_scalar[i]], 16 * sizeof(uint16_t)); } } else if (all_minus_one) { @@ -912,7 +913,8 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); + 
memset(state->m_absLevels[state_offset], 0, 16 * sizeof(uint8_t) * 4); + memset(state->m_ctxInit[state_offset], 0, 16 * sizeof(uint16_t) * 4); } else { @@ -930,14 +932,16 @@ static INLINE void update_states_avx2( if (state->m_remRegBins[state_id] >= 4) { state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); + memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); + memcpy(state->m_ctxInit[state_id], state->m_ctxInit[prvState], 16 * sizeof(uint16_t)); } else { state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); + memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); + memset(state->m_ctxInit[state_id], 0, 16 * sizeof(uint16_t)); } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; @@ -948,7 +952,7 @@ static INLINE void update_states_avx2( uint32_t max_abs_s[4]; _mm_storeu_si128((__m128i*)max_abs_s, max_abs); for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; levels[level_offset] = max_abs_s[i]; } state->all_gte_four = rem_reg_all_gte_4; @@ -957,18 +961,17 @@ static INLINE void update_states_avx2( const __m128i first_two_bytes = _mm_set1_epi32(0xffff); const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); __m128i tinit = _mm_i32gather_epi32( - (int *)state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + (int *)state->m_ctxInit[state_offset], + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), 2); tinit = _mm_and_si128(tinit, first_two_bytes); __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; switch (numIPos) { case 5: { @@ -1145,15 +1148,14 @@ static INLINE void 
update_states_avx2( else if (rem_reg_all_lt4) { const __m128i first_byte = _mm_set1_epi32(0xff); - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; const __m128i last_two_bytes = _mm_set1_epi32(0xffff); const __m128i last_byte = _mm_set1_epi32(0xff); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); __m128i tinit = _mm_i32gather_epi32( - (int*)state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), + (int*)state->m_ctxInit[state_offset], + _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), 2); tinit = _mm_and_si128(tinit, last_two_bytes); __m128i sum_abs = _mm_srli_epi32(tinit, 8); @@ -1221,9 +1223,9 @@ static INLINE void update_states_avx2( else { for (int i = 0; i < 4; ++i) { const int state_id = state_offset + i; - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); + uint8_t* levels = (uint8_t*)(state->m_absLevels[state_id]); if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_ctxInit[state_id][((scan_pos - 1) & 15)]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) \ @@ -1247,7 +1249,7 @@ static INLINE void update_states_avx2( memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; + coeff_t sumAbs = state->m_ctxInit[state_id][((scan_pos - 1) & 15)] >> 8; #define UPDATE(k) \ { \ coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ @@ -1269,7 +1271,7 @@ static INLINE void update_states_avx2( state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; } } else { - coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; + coeff_t sumAbs = (state->m_ctxInit[state_id][((scan_pos - 1) & 15)]) >> 8; #define UPDATE(k) \ { \ coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ From d850c346d646938639dbf2c2dac064984d544353 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 5 May 2023 16:21:31 +0300 Subject: [PATCH 234/254] [dep_quant] Change order of ctxInit --- src/dep_quant.c | 51 +++++++++------- src/dep_quant.h | 2 +- src/strategies/avx2/depquant-avx2.c | 93 ++++++++++++++--------------- 3 files changed, 75 insertions(+), 71 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index b970d510..c47b6892 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -570,10 +570,11 @@ static INLINE void update_common_context( const int prev_state, const int curr_state) { - const uint32_t numSbb = width_in_sbb * height_in_sbb; - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state & 3)].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state & 3)].levels; - size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + const uint32_t numSbb = width_in_sbb * height_in_sbb; + const int curr_state_without_offset = curr_state & 3; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].levels; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); if (prev_state != -1 && 
ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) { memcpy(sbbFlags, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t)); memcpy(levels + scan_pos, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); @@ -596,11 +597,11 @@ static INLINE void update_common_context( ctxs->m_allStates.m_remRegBins[curr_state] = (ctxs->m_allStates.effWidth * ctxs->m_allStates.effHeight * ctxBinSampleRatio) / 16; } ctxs->m_allStates.m_goRicePar[curr_state] = 0; - ctxs->m_allStates.m_refSbbCtxId[curr_state] = curr_state & 3; + ctxs->m_allStates.m_refSbbCtxId[curr_state] = curr_state_without_offset; ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; - uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[curr_state]; + uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2]; const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; const uint8_t* absLevels = levels + scanBeg; @@ -622,10 +623,10 @@ static INLINE void update_common_context( } } #undef UPDATE - templateCtxInit[id] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); + templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); } else { - templateCtxInit[id] = 0; + templateCtxInit[curr_state_without_offset + id * 4] = 0; } } memset(ctxs->m_allStates.m_absLevels[curr_state], 0, 16 * sizeof(uint8_t)); @@ -671,7 +672,7 @@ void uvg_dep_quant_update_state_eos( update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); - coeff_t tinit = state->m_ctxInit[curr_state_offset][((scan_pos - 1) & 15)]; + 
coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; coeff_t sumNum = tinit & 7; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumGt1 = sumAbs1 - sumNum; @@ -695,12 +696,13 @@ void uvg_dep_quant_update_state( const int baseLevel, const bool extRiceFlag, int decision_id) { - all_depquant_states* state = &ctxs->m_allStates; - int state_id = ctxs->m_curr_state_offset + decision_id; - state->m_rdCost[state_id] = decisions->rdCost[decision_id]; - if (decisions->prevId[decision_id] > -2) { - if (decisions->prevId[decision_id] >= 0) { - const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + all_depquant_states* state = &ctxs->m_allStates; + int state_id = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + int32_t prev_id_no_offset = decisions->prevId[decision_id]; + if (prev_id_no_offset > -2) { + if (prev_id_no_offset >= 0) { + const int prvState = ctxs->m_prev_state_offset + prev_id_no_offset; state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; @@ -713,7 +715,9 @@ void uvg_dep_quant_update_state( : 3); } memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); - memcpy(state->m_ctxInit[state_id], state->m_ctxInit[prvState], 16 * sizeof(uint16_t)); + for (int i = 0; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } } else { state->m_numSigSbb[state_id] = 1; @@ -723,7 +727,9 @@ void uvg_dep_quant_update_state( state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - memset(state->m_ctxInit[state_id], 0, 16 * sizeof(uint16_t)); + for (int i = 0; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = 0; + } } state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4; @@ -731,7 +737,7 @@ void uvg_dep_quant_update_state( levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_ctxInit[state_id][((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } @@ -753,7 +759,7 @@ void uvg_dep_quant_update_state( sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_ctxInit[state_id][(scan_pos - 1) & 15] >> 8; + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); @@ -777,7 +783,7 @@ void uvg_dep_quant_update_state( } } else { - coeff_t sumAbs = (state->m_ctxInit[state_id][(scan_pos - 1) & 15]) >> 8; + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; #define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); @@ -1055,7 +1061,10 @@ int uvg_dep_quant( height, compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - if(0){ + for (int i = 0; i < 8; ++i) { + assert(ctxs->m_allStates.m_refSbbCtxId[i] < 5); + } + if(1){ printf("%d\n", scanIdx); for (int i = 0; i < 4; i++) { printf("%lld %hu %d\n", 
ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); diff --git a/src/dep_quant.h b/src/dep_quant.h index 45220706..bd5ef363 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -150,7 +150,7 @@ typedef struct { typedef struct { int64_t ALIGNED(32) m_rdCost[12]; uint8_t ALIGNED(32) m_absLevels[12][16]; - uint16_t ALIGNED(32) m_ctxInit[12][16]; + uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; int8_t ALIGNED(16) m_numSigSbb[12]; int ALIGNED(32) m_remRegBins[12]; int8_t ALIGNED(16) m_refSbbCtxId[12]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index fb53713d..9d40e496 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -637,7 +637,6 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, temp[id % 4] = 0; if (id % 4 == 3) { all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); } continue; } @@ -726,35 +725,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); if (id % 4 == 3) { all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); last = template_ctx_init; } } - - __m256i* v_src_tmp = all; - - __m256i v_tmp[4]; - v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); - v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); - v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); - v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); - - __m256i v_tmp16_lo[2]; - __m256i v_tmp16_hi[2]; - v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); - v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], 
v_tmp[3]); - - v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset]), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 1]), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 2]), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_storeu_si256((__m256i*)(state->m_ctxInit[state_offset + 3]), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][0]), all[0]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][16]), all[1]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][32]), all[2]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); for (int i = 0; i < 4; ++i) { memset(state->m_absLevels[state_offset + i], 0, 16); @@ -836,9 +814,9 @@ static INLINE void update_states_avx2( __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); if (all_non_negative) { - __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId); + __m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); - prv_states = _mm_add_epi32(prv_states, prev_offset); + __m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset); __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); @@ -887,8 +865,20 @@ static INLINE void update_states_avx2( int32_t prv_states_scalar[4]; 
_mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t)); - memcpy(state->m_ctxInit[state_offset + i], state->m_ctxInit[prv_states_scalar[i]], 16 * sizeof(uint16_t)); + memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t)); + } + __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); + __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12,0, 0, 0, 0,0, 0, 0, 0,16, 16, 16, 16, 16, 16, 16, 16); + prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); + prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); + prev_state_full = _mm256_slli_epi16(prev_state_full, 1); + __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); + prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); + + for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { + __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i])); + data = _mm256_shuffle_epi8(data, prev_state_full); + _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data); } } else if (all_minus_one) { @@ -914,7 +904,7 @@ static INLINE void update_states_avx2( rem_reg_all_lt4 = (bit_mask == 0xFFFF); memset(state->m_absLevels[state_offset], 0, 16 * sizeof(uint8_t) * 4); - memset(state->m_ctxInit[state_offset], 0, 16 * sizeof(uint16_t) * 4); + memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4); } else { @@ -933,7 +923,9 @@ static INLINE void update_states_avx2( state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); } memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); - memcpy(state->m_ctxInit[state_id], state->m_ctxInit[prvState], 16 * sizeof(uint16_t)); + for (int k = 0; k < 16; ++k) { + state->m_ctxInit[state_offset >> 2][k * 4 + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; @@ -941,7 +933,9 @@ static INLINE void update_states_avx2( //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - memset(state->m_ctxInit[state_id], 0, 16 * sizeof(uint16_t)); + for (int k = 0; k < 16; ++k) { + state->m_ctxInit[state_offset >> 2][k * 4 + i] = 0; + } } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; @@ -958,16 +952,12 @@ static INLINE void update_states_avx2( state->all_gte_four = rem_reg_all_gte_4; state->all_lt_four = rem_reg_all_lt4; if (rem_reg_all_gte_4) { - const __m128i first_two_bytes = _mm_set1_epi32(0xffff); const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); - __m128i tinit = _mm_i32gather_epi32( - (int *)state->m_ctxInit[state_offset], - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, first_two_bytes); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); 
__m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); @@ -1149,15 +1139,11 @@ static INLINE void update_states_avx2( else if (rem_reg_all_lt4) { const __m128i first_byte = _mm_set1_epi32(0xff); uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; - const __m128i last_two_bytes = _mm_set1_epi32(0xffff); const __m128i last_byte = _mm_set1_epi32(0xff); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); - __m128i tinit = _mm_i32gather_epi32( - (int*)state->m_ctxInit[state_offset], - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, last_two_bytes); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); __m128i sum_abs = _mm_srli_epi32(tinit, 8); sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); switch (numIPos) { @@ -1225,7 +1211,7 @@ static INLINE void update_states_avx2( const int state_id = state_offset + i; uint8_t* levels = (uint8_t*)(state->m_absLevels[state_id]); if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_ctxInit[state_id][((scan_pos - 1) & 15)]; + coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) \ @@ -1249,7 +1235,7 @@ static INLINE void update_states_avx2( memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_ctxInit[state_id][((scan_pos - 1) & 15)] >> 8; + coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8; #define UPDATE(k) \ { \ coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ @@ -1271,7 +1257,7 @@ static INLINE void update_states_avx2( state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; } } else { - coeff_t sumAbs = (state->m_ctxInit[state_id][((scan_pos - 1) & 15)]) >> 8; + coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8; #define UPDATE(k) \ { \ coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ @@ -1355,6 +1341,15 @@ void uvg_dep_quant_decide_and_update_avx2( } else if (!zeroOut) { update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); } + //for (int i = 0; i<4; i++) { + // for (int k = 0; k < 16; ++k) { + // printf( + // "%3d ", + // ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset / 4][k * 4 + i]); + // } + // printf("\n"); + //} + //printf("\n"); if (spt == SCAN_SOCSBB) { SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); From 915104cf104b48c80abbbb2753391c7cf67d99a1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 8 May 2023 16:34:10 +0300 Subject: [PATCH 235/254] [dep_quant] Change order of absLevels --- src/dep_quant.c | 93 +++++---- src/dep_quant.h | 4 +- src/strategies/avx2/depquant-avx2.c | 310 +++++++++++++--------------- 3 files changed, 199 insertions(+), 208 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index c47b6892..2656f9aa 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -325,12 +325,12 @@ static void reset_common_context(common_context* ctx, const rate_estimator_t * r memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); uint8_t* next_sbb_memory = ctx->sbb_memory; uint8_t* 
next_level_memory = ctx->level_memory; - for (int k = 0; k < 8; k++, next_sbb_memory += numSbb, next_level_memory += num_coeff) { + for (int k = 0; k < 2; k++, next_sbb_memory += numSbb * 4llu, next_level_memory += num_coeff * 4llu) { ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory; ctx->m_allSbbCtx[k].levels = next_level_memory; } ctx->m_curr_sbb_ctx_offset = 0; - ctx->m_prev_sbb_ctx_offset = 4; + ctx->m_prev_sbb_ctx_offset = 1; ctx->num_coeff = num_coeff; } @@ -570,23 +570,35 @@ static INLINE void update_common_context( const int prev_state, const int curr_state) { - const uint32_t numSbb = width_in_sbb * height_in_sbb; - const int curr_state_without_offset = curr_state & 3; - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].levels; + const uint32_t numSbb = width_in_sbb * height_in_sbb; + const int curr_state_without_offset = curr_state & 3; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - if (prev_state != -1 && ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) { - memcpy(sbbFlags, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); + int8_t prev_sbb_state = ctxs->m_allStates.m_refSbbCtxId[prev_state]; + if (prev_state != -1 && prev_sbb_state >= 0) { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 
cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state]; + } } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state_without_offset] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = ctxs->m_allStates.m_absLevels[curr_state / 4][i * 4 + curr_state_without_offset]; } - sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state], 16 * sizeof(uint8_t)); - const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0); + const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right * 4 + curr_state_without_offset] : false) + || (next_sbb_below ? sbbFlags[next_sbb_below* 4 + curr_state_without_offset] : false) ? 
1 : 0); ctxs->m_allStates.m_numSigSbb[curr_state] = 0; if (prev_state != -1) { ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state]; @@ -604,11 +616,11 @@ static INLINE void update_common_context( uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2]; const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = levels + scanBeg; + const uint8_t* absLevels = levels + scanBeg * 4; for (int id = 0; id < 16; id++, nbOut++) { if (nbOut->num) { coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; -#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } +#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k] * 4 + curr_state_without_offset]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } UPDATE(0); if (nbOut->num > 1) { UPDATE(1); @@ -623,13 +635,15 @@ static INLINE void update_common_context( } } #undef UPDATE - templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); + templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1 << 3)) + (uint16_t)(MIN(127, sumAbs) << 8); } else { templateCtxInit[curr_state_without_offset + id * 4] = 0; } } - memset(ctxs->m_allStates.m_absLevels[curr_state], 0, 16 * sizeof(uint8_t)); + for (int i = curr_state_without_offset; i < 64; i += 4) { + ctxs->m_allStates.m_absLevels[curr_state >> 2][i] = 0; + } } @@ -655,18 +669,25 @@ void uvg_dep_quant_update_state_eos( if (decisions->prevId[decision_id] >= 4) { prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } } else if (decisions->prevId[decision_id] >= 0) { 
prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i + decision_id] = + state->m_absLevels[ctxs->m_prev_state_offset / 4][i + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } } - uint8_t* temp = &state->m_absLevels[curr_state_offset][scan_pos & 15]; + uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id]; *temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, @@ -714,10 +735,12 @@ void uvg_dep_quant_update_state( ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); for (int i = 0; i < 64; i += 4) { state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; } + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } } else { state->m_numSigSbb[state_id] = 1; @@ -726,21 +749,23 @@ void uvg_dep_quant_update_state( //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - for (int i = 0; i < 64; i += 4) { - state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = 0; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][i] = 0; + } + for (int i = decision_id; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][i] = 0; } } state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4; - uint8_t* levels = state->m_absLevels[state_id]; - levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); + uint8_t* levels = state->m_absLevels[ctxs->m_curr_state_offset >> 2]; + levels[(scan_pos & 15) * 4 + decision_id] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); if (state->m_remRegBins[state_id] >= 4) { coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -760,7 +785,7 @@ void uvg_dep_quant_update_state( coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -784,7 +809,7 @@ void uvg_dep_quant_update_state( } else { coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; 
sumAbs+=t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -1061,10 +1086,8 @@ int uvg_dep_quant( height, compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - for (int i = 0; i < 8; ++i) { - assert(ctxs->m_allStates.m_refSbbCtxId[i] < 5); - } - if(1){ + + if(0){ printf("%d\n", scanIdx); for (int i = 0; i < 4; i++) { printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); diff --git a/src/dep_quant.h b/src/dep_quant.h index bd5ef363..6ef54f4d 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -119,7 +119,7 @@ typedef struct { typedef struct { const NbInfoOut* m_nbInfo; uint32_t m_sbbFlagBits[2][2]; - SbbCtx m_allSbbCtx[8]; + SbbCtx m_allSbbCtx[2]; int m_curr_sbb_ctx_offset; int m_prev_sbb_ctx_offset; uint8_t sbb_memory[8 * 1024]; @@ -149,7 +149,7 @@ typedef struct { } depquant_state; typedef struct { int64_t ALIGNED(32) m_rdCost[12]; - uint8_t ALIGNED(32) m_absLevels[12][16]; + uint8_t ALIGNED(32) m_absLevels[3][16 * 4]; uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; int8_t ALIGNED(16) m_numSigSbb[12]; int ALIGNED(32) m_remRegBins[12]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 9d40e496..1a00be56 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -497,6 +497,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i prev_state; __m128i prev_state_no_offset; __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); if (all_above_four) { prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); @@ -505,16 +506,14 @@ static void 
update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, prev_state_no_offset ); memset(&state->m_numSigSbb[state_offset], 0, 4); - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevels[state_offset + i], 0, 16 * sizeof(uint8_t)); - } + memset(state->m_absLevels[state_offset >> 2], 0, 64 * sizeof(uint8_t)); + } else if (all_between_zero_and_three) { - prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prev_state_no_offset = _mm_load_si128((const __m128i*)decisions->prevId); prev_state = _mm_add_epi32( prev_state_no_offset, - _mm_load_si128((const __m128i*)decisions->prevId) + _mm_set1_epi32(ctxs->m_prev_state_offset) ); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); @@ -527,10 +526,15 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); - int32_t prev_state_scalar[4]; - _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + __m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + __m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset 
>> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } } else { int prev_state_s[4] = {-1, -1, -1, -1}; @@ -540,27 +544,31 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (decisions->prevId[decision_id] >= 4) { prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prev_state_s[i]], 16 * sizeof(uint8_t)); + for (int j = 0; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][j + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } all_have_previous_state = false; } } prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + 
memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4); + // Update common context __m128i last; @@ -571,31 +579,40 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int previous_state_array[4]; _mm_storeu_si128((__m128i*)previous_state_array, prev_state); for (int curr_state = 0; curr_state < 4; ++curr_state) { - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset ].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; const int p_state = previous_state_array[curr_state]; if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { - const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; - memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); + const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels[scan_pos * 4 + i * 4 + prev_sbb]; + } } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = ctxs->m_allStates.m_absLevels[state_offset / 4][i * 4 + curr_state]; } - 
sbbFlags[cg_pos] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state + state_offset], 16 * sizeof(uint8_t)); } - - __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); - __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); - __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); - - __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + + __m128i sbb_right = next_sbb_right ? + _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) : + _mm_set1_epi32(0); + + __m128i sbb_below = next_sbb_below ? 
+ _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_below * 4])) : + _mm_set1_epi32(0); __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); - sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); @@ -621,7 +638,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4; __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); __m128i first_byte = _mm_set1_epi32(0xff); @@ -629,8 +646,6 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i fours = _mm_set1_epi32(4); __m256i all[4]; uint64_t temp[4]; - const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, - 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); for (int id = 0; id < 16; id++, nbOut++) { if (nbOut->num == 0) { @@ -646,9 +661,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, switch (nbOut->num) { case 5: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); - __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[4] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -661,9 +674,7 @@ static void 
update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 4: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[3] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -674,9 +685,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 3: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[2] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -687,9 +696,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 2: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[1] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -700,9 +707,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 1: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = 
_mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[0] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -735,7 +740,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); for (int i = 0; i < 4; ++i) { - memset(state->m_absLevels[state_offset + i], 0, 16); + memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); } } @@ -811,13 +816,13 @@ static INLINE void update_states_avx2( bool rem_reg_all_gte_4 = true; bool rem_reg_all_lt4 = true; + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); if (all_non_negative) { __m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); __m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); @@ -862,17 +867,32 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - int32_t prv_states_scalar[4]; - _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t)); + + __m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + __m256i temp_add = _mm256_setr_epi32( + 0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c, + 
0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } + __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12,0, 0, 0, 0,0, 0, 0, 0,16, 16, 16, 16, 16, 16, 16, 16); prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); prev_state_full = _mm256_slli_epi16(prev_state_full, 1); - __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); + temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { @@ -903,7 +923,7 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - memset(state->m_absLevels[state_offset], 0, 16 * sizeof(uint8_t) * 4); + memset(state->m_absLevels[state_offset >> 2], 0, 16 * sizeof(uint8_t) * 4); memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4); } @@ -922,35 +942,36 @@ static INLINE void update_states_avx2( if (state->m_remRegBins[state_id] >= 4) { state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); for (int k = 0; k < 16; ++k) { state->m_ctxInit[state_offset >> 2][k * 4 + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; } + for (int k = 0; k < 16; ++k) { + state->m_absLevels[state_offset >> 2][k * 4 + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - for (int k = 0; k < 16; ++k) { - state->m_ctxInit[state_offset >> 2][k * 4 + i] = 0; + for (int k = i; k < 64; k += 4) { + state->m_ctxInit[state_offset >> 2][k] = 0; + state->m_absLevels[state_offset >> 2][k] = 0; } } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; } } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4); + state->all_gte_four = rem_reg_all_gte_4; 
state->all_lt_four = rem_reg_all_lt4; + if (rem_reg_all_gte_4) { const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); @@ -961,15 +982,11 @@ static INLINE void update_states_avx2( __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; switch (numIPos) { case 5: { - __m128i t = _mm_i32gather_epi32( - (int *)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -984,11 +1001,7 @@ static INLINE void update_states_avx2( } case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1001,11 +1014,7 @@ static INLINE void update_states_avx2( } case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1018,11 +1027,7 @@ static INLINE void update_states_avx2( } case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = 
_mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1034,11 +1039,7 @@ static INLINE void update_states_avx2( sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1075,51 +1076,32 @@ static INLINE void update_states_avx2( switch (numIPos) { case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); - // Need this to make sure we don't go beyond 255 - sum_abs = _mm_and_si128(sum_abs, first_byte); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); } case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); + __m128i t = 
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } break; default: assert(0); } - sum_abs = _mm_and_si128(sum_abs, first_byte); if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { @@ -1138,7 +1120,7 @@ static INLINE void update_states_avx2( else if (rem_reg_all_lt4) { const __m128i first_byte = _mm_set1_epi32(0xff); - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; const __m128i last_byte = _mm_set1_epi32(0xff); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); @@ -1147,48 +1129,34 @@ static INLINE void update_states_avx2( __m128i sum_abs = _mm_srli_epi32(tinit, 8); sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); switch (numIPos) { - case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - // Need this to make sure we don't go beyond 255 - sum_abs = _mm_and_si128(sum_abs, first_byte); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); - } - case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 2: { - __m128i t = _mm_i32gather_epi32( - 
(int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } break; + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; default: assert(0); } - sum_abs = _mm_and_si128(sum_abs, last_byte); if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { @@ -1209,14 +1177,14 @@ static INLINE void update_states_avx2( else { for (int i = 0; i < 4; ++i) { const int state_id = state_offset + i; - uint8_t* levels = (uint8_t*)(state->m_absLevels[state_id]); + uint8_t* levels = (uint8_t*)(state->m_absLevels[state_offset >> 2]); if (state->m_remRegBins[state_id] >= 4) { coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs1 += MIN(4 + (t & 1), t); \ sumNum += !!t; \ } @@ -1238,7 +1206,7 @@ static INLINE void update_states_avx2( 
coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs += t; \ } switch (numIPos) { @@ -1260,7 +1228,7 @@ static INLINE void update_states_avx2( coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs += t; \ } switch (numIPos) { @@ -1345,7 +1313,7 @@ void uvg_dep_quant_decide_and_update_avx2( // for (int k = 0; k < 16; ++k) { // printf( // "%3d ", - // ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset / 4][k * 4 + i]); + // ctxs->m_allStates.m_absLevels[ctxs->m_curr_state_offset / 4][k * 4 + i]); // } // printf("\n"); //} From bc246013695f56a2190738e839abf1ab08738479 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 9 May 2023 11:28:23 +0300 Subject: [PATCH 236/254] [avx2] Improve avx2 version of update_common_context --- src/strategies/avx2/depquant-avx2.c | 124 ++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 1a00be56..cacee3fd 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -497,7 +497,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i prev_state; __m128i prev_state_no_offset; __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12); if (all_above_four) { prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), 
_mm_set1_epi32(4)); @@ -575,34 +575,101 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, { const uint32_t numSbb = width_in_sbb * height_in_sbb; common_context* cc = &ctxs->m_common_context; - size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - int previous_state_array[4]; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scan_pos * 4; + uint8_t* levels_in = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels + scan_pos * 4; + int previous_state_array[4]; _mm_storeu_si128((__m128i*)previous_state_array, prev_state); - for (int curr_state = 0; curr_state < 4; ++curr_state) { - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset ].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; - const int p_state = previous_state_array[curr_state]; - if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { - const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; - for (int i = 0; i < numSbb; ++i) { - sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; - } - for (int i = 16; i < setCpSize; ++i) { - levels[scan_pos * 4 + i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels[scan_pos * 4 + i * 4 + prev_sbb]; + + if (all_have_previous_state) { + __m128i temp_p_state = _mm_shuffle_epi8(prev_state, control); + __m128i ref_sbb_ctx_offset = + _mm_load_si128((__m128i*)ctxs->m_allStates.m_refSbbCtxId); + ref_sbb_ctx_offset = _mm_shuffle_epi8(ref_sbb_ctx_offset, temp_p_state); + if (numSbb <= 4) { + __m128i incremented_ref_sbb_ctx_offset = _mm_add_epi8( + ref_sbb_ctx_offset, + _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12) + ); + __m128i blend_mask = _mm_cmpeq_epi8(ref_sbb_ctx_offset, _mm_set1_epi32(0xffffffff)); + __m128i 
sbb_flags = _mm_loadu_si128((__m128i*)cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags); + sbb_flags = _mm_shuffle_epi8(sbb_flags, incremented_ref_sbb_ctx_offset); + sbb_flags = _mm_blendv_epi8(sbb_flags, _mm_set1_epi64x(0), blend_mask); + if (numSbb == 2) { + uint64_t temp = _mm_extract_epi64(sbb_flags, 0); + memcpy(sbbFlags, &temp, 8); + } else { + _mm_storeu_si128((__m128i*)sbbFlags, sbb_flags); } } else { - for (int i = 0; i < numSbb; ++i) { - sbbFlags[i * 4 + curr_state] = 0; - } - for (int i = 16; i < setCpSize; ++i) { - levels[scan_pos * 4 + i * 4 + curr_state] = 0; + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) + ); + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (int i = 0; i < numSbb * 4; i += 32) { + __m256i sbb_flags = _mm256_loadu_si256((__m256i*)(&cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i])); + sbb_flags = _mm256_shuffle_epi8(sbb_flags, inc_ref_state); + _mm256_store_si256((__m256i*)&sbbFlags[i], sbb_flags); } } - sbbFlags[cg_pos * 4 + curr_state] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - for (int i = 0; i < 16; ++i) { - levels[scan_pos * 4 + i * 4 + curr_state] = ctxs->m_allStates.m_absLevels[state_offset / 4][i * 4 + curr_state]; + int levels_start = 16; + const uint64_t limit = setCpSize & ~(8 - 1); + if (levels_start < limit) { + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 
0x08080808, 0x0c0c0c0c) + ); + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (; levels_start < limit; levels_start += 8) { + __m256i levels_v = _mm256_loadu_si256((__m256i*)(&levels_in[levels_start * 4])); + levels_v = _mm256_shuffle_epi8(levels_v, inc_ref_state); + _mm256_store_si256((__m256i*)&levels[levels_start * 4], levels_v); + } + } + uint8_t ref_sbb[4]; + int temp_sbb_ref = _mm_extract_epi32(ref_sbb_ctx_offset, 0); + memcpy(ref_sbb, &temp_sbb_ref, 4); + for (;levels_start < setCpSize; ++levels_start) { + uint8_t new_values[4]; + new_values[0] = ref_sbb[0] != 0xff ? levels_in[levels_start * 4 + ref_sbb[0]] : 0; + new_values[1] = ref_sbb[1] != 0xff ? levels_in[levels_start * 4 + ref_sbb[1]] : 0; + new_values[2] = ref_sbb[2] != 0xff ? levels_in[levels_start * 4 + ref_sbb[2]] : 0; + new_values[3] = ref_sbb[3] != 0xff ? levels_in[levels_start * 4 + ref_sbb[3]] : 0; + memcpy(&levels[levels_start * 4], new_values, 4); + } + + } + else { + for (int curr_state = 0; curr_state < 4; ++curr_state) { + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[i * 4 + curr_state] = levels_in[i * 4 + prev_sbb]; + } + } else { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[ i * 4 + curr_state] = 0; + } + } } } + memcpy(levels, ctxs->m_allStates.m_absLevels[state_offset / 4], 64); + memcpy(&sbbFlags[cg_pos * 4], &ctxs->m_allStates.m_numSigSbb[state_offset], 4); __m128i sbb_right = next_sbb_right ? 
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) : @@ -640,8 +707,6 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4; - __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); - __m128i first_byte = _mm_set1_epi32(0xff); __m128i ones = _mm_set1_epi32(1); __m128i fours = _mm_set1_epi32(4); __m256i all[4]; @@ -738,10 +803,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][16]), all[1]); _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][32]), all[2]); _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); - - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); - } + + memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); } __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); @@ -973,10 +1036,8 @@ static INLINE void update_states_avx2( state->all_lt_four = rem_reg_all_lt4; if (rem_reg_all_gte_4) { - const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); - const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); tinit = _mm_cvtepi16_epi32(tinit); __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); @@ -1119,11 +1180,8 @@ static INLINE void update_states_avx2( } else if (rem_reg_all_lt4) { - const __m128i first_byte = _mm_set1_epi32(0xff); uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; - const __m128i last_byte = _mm_set1_epi32(0xff); 
const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); - const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); tinit = _mm_cvtepi16_epi32(tinit); __m128i sum_abs = _mm_srli_epi32(tinit, 8); From f2fb641acb9ae2a3dc6cc039d372ee1972a1a907 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 10 May 2023 09:25:58 +0300 Subject: [PATCH 237/254] [avx2] Replace inefficient loop with AVX2 code --- src/strategies/avx2/depquant-avx2.c | 52 +++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index cacee3fd..82ddd498 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -1005,26 +1005,58 @@ static INLINE void update_states_avx2( if (state->m_remRegBins[state_id] >= 4) { state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); } - for (int k = 0; k < 16; ++k) { - state->m_ctxInit[state_offset >> 2][k * 4 + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; - } - for (int k = 0; k < 16; ++k) { - state->m_absLevels[state_offset >> 2][k * 4 + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; - } } else { state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); - for (int k = i; k < 64; k += 4) { - state->m_ctxInit[state_offset >> 2][k] = 0; - state->m_absLevels[state_offset >> 2][k] = 0; - } } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; } + { + __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); + __m256i shuffle_mask = _mm256_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); + prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); + __m256i temp_add = _mm256_setr_epi32( + 0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c, + 0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c); + __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1)); + prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); + prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_full); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); + } + } + + { + __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); + __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); + prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); + __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1)); + prev_state_full = _mm256_slli_epi16(prev_state_full, 1); + __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 
17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); + + prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); + prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask); + + for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { + __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i])); + data = _mm256_shuffle_epi8(data, prev_state_full); + _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data); + } + } } uint32_t level_offset = scan_pos & 15; __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); From 254826d3964e9f00d285ed11adde50882fef78f4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 29 May 2023 10:36:18 +0300 Subject: [PATCH 238/254] [avx2] Add comments --- src/strategies/avx2/depquant-avx2.c | 96 ++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 82ddd498..a6ac5a90 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -81,12 +81,18 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en if (state->all_gte_four) { + // pqDataA + // In case the both levels are smaller than 4 or gte 4 avx 2 can be used if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + // The coeffFracBits arrays are 6 elements long, so we need to offset the indices and gather is only eficient way to load the data __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + // RD costs are 64 bit, so we need to extend the 32 bit values __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); - } else if (pqDataA->absLevel[0] >= 4 && 
pqDataA->absLevel[3] >= 4) { + } + + else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); @@ -96,6 +102,8 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i max_rice = _mm_set1_epi32(31); value = _mm_min_epi32(value, max_rice); + // In the original implementation the goRiceTab is selected beforehand, but since we need to load from + // potentially four different locations, we need to calculate the offsets and use gather __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); value = _mm_add_epi32(value, go_rice_tab); @@ -104,7 +112,8 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); } else { const int pqAs[4] = {0, 0, 3, 3}; - ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + // AVX2 cannot be used so we have to loop the values normally for (int i = 0; i < 4; i++) { const int state_offset = start + i; const int pqA = pqAs[i]; @@ -119,6 +128,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); } + // pqDataB, same stuff as for pqDataA if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); @@ -159,6 +169,10 @@ static void 
check_rd_costs_avx2(const all_depquant_states* const state, const en } if (spt == SCAN_ISCSBB) { + // This loads values such as that the values are + // |State 0 Flag 0|State 0 Flag 1|State 1 Flag 0|State 1 Flag 1|State 2 Flag 0|State 2 Flag 1|State 3 Flag 0|State 3 Flag 1| + // By setting the flag 1 bits to zero we get the flag 0 values as 64 bit integers (even) variable which we can be summed to the rd_cost + // Flag 1 values can be shifted 32 to right and again we have 64 bit integeres holding the values (odd) which can be summed to the rd_cost __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); __m256i odd = _mm256_srli_epi64(original, 32); @@ -168,6 +182,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en } else if (spt == SCAN_SOCSBB) { __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + // Same here __m256i m_sigFracBits_0 = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); __m256i m_sigFracBits_1 = _mm256_srli_epi64(original, 32); @@ -185,6 +200,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en else { int num_sig_sbb; memcpy(&num_sig_sbb, &state->m_numSigSbb[start], 4); + // numSigSbb only has values 1 or zero, so if all 4 values are 1 the complete value is 0x01010101 if (num_sig_sbb == 0x01010101) { __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); @@ -224,25 +240,30 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en // RD cost A { __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + // Calculate mask for pqDataA->absLevel <= state->m_goRiceZero + // The mask is reverse of the one that is used in the scalar code so the values are 
in other order in blendv __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); - + + // pqDataA->absLevel < RICEMAX ? pqDataA->absLevel : RICEMAX - 1 __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + // pqDataA->absLevel - 1 __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - + // Again calculate the offset for the different go_rice_tabs __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); __m128i offsets = _mm_add_epi32(selected, go_rice_offset); __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + //(1 << SCALE_BITS) + goRiceTab[selected] __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); } - // RD cost b + // RD cost b, same as RD cost A { __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); @@ -265,6 +286,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en } // RD cost Z { + // This time the go_rice_tab is offset with only the go_rize_zero __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); @@ -325,6 +347,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); } + // Re order the cost so that cost of state 0 is in the first element state 1 in second etc rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); @@ -334,8 +357,9 @@ static void check_rd_costs_avx2(const 
all_depquant_states* const state, const en __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); + // Store data for all of the cost so that the lower 32 bits have coefficient magnitude and upper have the previous state + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); @@ -514,6 +538,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, prev_state_no_offset, _mm_set1_epi32(ctxs->m_prev_state_offset) ); + // Set the high bytes to 0xff so that the shuffle will set them to zero and it won't cause problems with the min_epi32 __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); @@ -526,9 +551,12 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); + // Set this so that the temp_prev_state has the previous state set into the first 4 bytes and duplicated to the second 4 bytes __m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control); __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + // Duplicate the state all over the vector so that all 32 bytes hold the previous states prev_state_256 = 
_mm256_permute4x64_epi64(prev_state_256, 0); + // Increment the second set by four, third by eight and fourth by twelve and repeat for the second lane __m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c); prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { @@ -537,6 +565,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } } else { + // TODO: it would be possible to do the absLevels update with avx2 even here just would need to set the shuffle mask to + // 0xff for the states that don't have previous state or the previous state is a skip state int prev_state_s[4] = {-1, -1, -1, -1}; for (int i = 0; i < 4; ++i) { const int decision_id = i; @@ -584,14 +614,18 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (all_have_previous_state) { __m128i temp_p_state = _mm_shuffle_epi8(prev_state, control); - __m128i ref_sbb_ctx_offset = - _mm_load_si128((__m128i*)ctxs->m_allStates.m_refSbbCtxId); + // Similarly to how the abs level was done earlier set the previous state duplicated across the lane + __m128i ref_sbb_ctx_offset = _mm_load_si128((__m128i*)ctxs->m_allStates.m_refSbbCtxId); ref_sbb_ctx_offset = _mm_shuffle_epi8(ref_sbb_ctx_offset, temp_p_state); + // numSbb is two or four, in case it is one this function is never called if (numSbb <= 4) { __m128i incremented_ref_sbb_ctx_offset = _mm_add_epi8( ref_sbb_ctx_offset, _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12) ); + // In case the ref_sbb_ctx is minus one the values need to be set to zero, which is achieved by + // first finding which states have the minus one and then the blend is used after the load to + // set the corresponding values to zero __m128i blend_mask = _mm_cmpeq_epi8(ref_sbb_ctx_offset, 
_mm_set1_epi32(0xffffffff)); __m128i sbb_flags = _mm_loadu_si128((__m128i*)cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags); sbb_flags = _mm_shuffle_epi8(sbb_flags, incremented_ref_sbb_ctx_offset); @@ -609,6 +643,10 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, extended_ref_state, _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) ); + // Unlike the case for two or four sbb, the blendv is used to set the shuffle mask to -1 so that + // the shuffle will set the values to zero. Its better to do this way here so that the blendv is + // not called in the loop, and the other is done the otherway because I implemented it first + // and only realized afterwards that this order is better __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); for (int i = 0; i < numSbb * 4; i += 32) { @@ -617,9 +655,12 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm256_store_si256((__m256i*)&sbbFlags[i], sbb_flags); } } + // The first 16 variables will be loaded from the previous state so this can be started from 16 int levels_start = 16; + // Do avx2 optimized version for the amount that is divisible by 8 (four states of 8 1-byte values) const uint64_t limit = setCpSize & ~(8 - 1); if (levels_start < limit) { + // Overall this is the same to the numSbb > 4 __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); __m256i inc_ref_state = _mm256_add_epi8( @@ -637,6 +678,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, uint8_t ref_sbb[4]; int temp_sbb_ref = _mm_extract_epi32(ref_sbb_ctx_offset, 0); memcpy(ref_sbb, &temp_sbb_ref, 4); + // Do the excess that is not divisible by 8 for (;levels_start < setCpSize; ++levels_start) 
{ uint8_t new_values[4]; new_values[0] = ref_sbb[0] != 0xff ? levels_in[levels_start * 4 + ref_sbb[0]] : 0; @@ -648,6 +690,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, } else { + //TODO: This could also be done using avx2 just need to check for both wheter the previous state + // is minus one and that if the ref_sbb_ctx_id is minus one. for (int curr_state = 0; curr_state < 4; ++curr_state) { const int p_state = previous_state_array[curr_state]; if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { @@ -681,6 +725,8 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); + // Gather is not necessary here put it would require at least five operation to do the same thing + // so the performance gain in my opinion is not worth the readability loss __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); @@ -806,6 +852,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); } + // End update common context __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); __m128i sum_abs1 = _mm_and_si128( @@ -829,6 +876,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); uint32_t sum_gt1_s[4]; _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); + // These are 192 bits so no benefit from using avx2 for (int i = 0; i < 4; ++i) { memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); } @@ -887,7 +935,9 @@ static INLINE void update_states_avx2( __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); __m128i 
prv_states = _mm_add_epi32(prv_states_o, prev_offset); __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); - + + // sig_sbb values matter only whether they are one or zero so make sure that they stay at one or zero + // which allows some optimizations when handling the values in update_state_eos_avx2 __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); @@ -895,7 +945,8 @@ static INLINE void update_states_avx2( sig_sbb = _mm_or_si128(sig_sbb, has_coeff); int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); - + + // These following two are jus shuffled and then extracted the 4 bytes that store the values __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId); ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); @@ -906,23 +957,30 @@ static INLINE void update_states_avx2( int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - + // Again gather is not necessary but it is easier to read and shouldn't have too large of a performance hit + // Should be true for all gathers here __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); + // Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1; __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); __m128i ones = _mm_set1_epi32(1); rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); __m128i reg_bins_sub = _mm_set1_epi32(0); + // Next two lines: (decision->absLevel < 2 ? 
(unsigned)decision->absLevel : 3) __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); + // Depending on whether the rem_reg_bins are smaller than four or not, + // the reg_bins_sub is either 0 or result of the above operation __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins); + // Save whether all rem_reg_bins are smaller than four or not and gte 4 as these + // are needed in multiple places __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); int bit_mask = _mm_movemask_epi8(mask); rem_reg_all_gte_4 = (bit_mask == 0xFFFF); @@ -930,7 +988,7 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - + // This is the same as in update_state_eos_avx2 __m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control); __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); @@ -950,15 +1008,21 @@ static INLINE void update_states_avx2( _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } + // This is overall the same as absLevels but since the ctx values are two bytes all of the + // masks have to account for that __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12,0, 0, 0, 0,0, 0, 0, 0,16, 16, 16, 16, 16, 16, 16, 16); prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); prev_state_full = 
_mm256_slli_epi16(prev_state_full, 1); - temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); + temp_add = _mm256_setr_epi8( + 0, 1, 0, 1, 0, 1, 0, 1, + 8, 9, 8, 9, 8, 9, 8, 9, + 16, 17, 16, 17, 16, 17, 16, 17, + 24, 25, 24, 25, 24, 25, 24, 25); prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); - for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint16_t)))) { __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i])); data = _mm256_shuffle_epi8(data, prev_state_full); _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data); @@ -1016,6 +1080,7 @@ static INLINE void update_states_avx2( rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; } { + // Same as for the all_non_negative but use blendv to set the shuffle mask to -1 for the states that do not have previous state __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); __m256i shuffle_mask = _mm256_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); @@ -1198,6 +1263,7 @@ static INLINE void update_states_avx2( if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { + // int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); __m128i sum_all = _mm_max_epi32( _mm_min_epi32( _mm_set1_epi32(31), @@ -1257,7 +1323,7 @@ static INLINE void update_states_avx2( int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - + // This cannot be vectorized because there is no way to dynamically shift values for (int i = 0; i < 4; ++i) { state->m_goRiceZero[state_offset + i] = (i < 2 ? 
1 : 2) << state->m_goRicePar[state_offset + i]; } From 2caf077cff6e28f5d6db2eefc8f1d6c27b5963ed Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 29 May 2023 12:18:08 +0300 Subject: [PATCH 239/254] Remove avx512 instrincis --- src/strategies/avx2/depquant-avx2.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index a6ac5a90..357932f9 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -104,7 +104,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en value = _mm_min_epi32(value, max_rice); // In the original implementation the goRiceTab is selected beforehand, but since we need to load from // potentially four different locations, we need to calculate the offsets and use gather - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i *)&state->m_goRicePar[start])); go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); value = _mm_add_epi32(value, go_rice_tab); @@ -144,7 +144,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en __m128i max_rice = _mm_set1_epi32(31); value = _mm_min_epi32(value, max_rice); - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); value = _mm_add_epi32(value, go_rice_tab); @@ -727,7 +727,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); // Gather is not necessary here put it would require at least five operation to do the same thing // so the performance gain in my opinion is not worth the readability loss - __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t 
*)cc->m_sbbFlagBits[0], sig_sbb, 8); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long int *)cc->m_sbbFlagBits[0], sig_sbb, 8); _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); memset(&state->m_numSigSbb[state_offset], 0, 4); @@ -868,7 +868,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); offsets = _mm_add_epi32(offsets, sum_abs_min); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); + __m256i sig_frac_bits = _mm256_i32gather_epi64((long long const*)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); @@ -959,7 +959,7 @@ static INLINE void update_states_avx2( // Again gather is not necessary but it is easier to read and shouldn't have too large of a performance hit // Should be true for all gathers here - __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sbbFracBits[0], prv_states, 8); _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); // Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1; @@ -1218,7 +1218,7 @@ static INLINE void update_states_avx2( _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), _mm_set1_epi32(3)); offsets = _mm_add_epi32(offsets, temp); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); sum_gt1 = 
_mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); From 1c293b82536d38b9ab22792c7c1d970e1c4f58a4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 21 Jul 2023 10:49:46 +0300 Subject: [PATCH 240/254] pass context_store as pointer This reverts commit 47c5ea3d5cc6db555511b31d525069eb9f2346ca. --- CMakeLists.txt | 2 +- src/dep_quant.c | 71 ++++------------------- src/strategies/avx2/depquant-avx2.c | 14 +---- src/strategies/generic/depquant-generic.c | 6 +- src/strategies/strategies-depquant.h | 18 +++--- 5 files changed, 27 insertions(+), 84 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cafb8fd8..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,7 @@ target_include_directories(uvg266 PUBLIC src) target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) -file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c" "src/dep_quant.c") +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index 2656f9aa..8513cf77 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -32,8 +32,6 @@ #include "dep_quant.h" -#include - #include "cu.h" #include "encoderstate.h" #include "intra.h" @@ -923,63 +921,15 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 
7 : 15; } - //uvg_find_first_non_zero_coeff(srcCoeff, enableScalingLists, dep_quant_context, scan, q_coeff, &firstTestPos, width, height); - const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; - const int32_t thres = dep_quant_context.m_quant->m_thresLast; - int temp = firstTestPos; - if (enableScalingLists) { - for (; temp >= 0; (temp)--) { - coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); - if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { - break; - } - } - } else { - coeff_t thresTmp = thres / (4 * default_quant_coeff); - if (temp >= 16 && height >= 4) { - __m256i th = _mm256_set1_epi16(thresTmp); - temp -= 15; - for (; temp >= 0; temp -= 16) { - __m256i sbb_data; - if (width <= 4) { - sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); - } else if (width == 8) { - uint32_t i = scan[temp]; - __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); - __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i + 12]); - sbb_data = _mm256_blend_epi32(first, second, 204); - } else { - int16_t temp_d[16]; - uint32_t i = scan[temp]; - memcpy(temp_d, &srcCoeff[i], 8); - i += width; - memcpy(temp_d + 4, &srcCoeff[i], 8); - i += width; - memcpy(temp_d + 8, &srcCoeff[i], 8); - i += width; - memcpy(temp_d + 12, &srcCoeff[i], 8); - - sbb_data = _mm256_loadu_si256((__m256i const*)temp_d); - } - sbb_data = _mm256_abs_epi16(sbb_data); - - __m256i a = _mm256_cmpgt_epi16(sbb_data, th); - if (!_mm256_testz_si256(a, a)) { - if (temp >= 0) { - temp += 15; - } - break; - } - } - } - for (; temp >= 0; temp--) { - if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { - break; - } - } - } - - firstTestPos = temp; + uvg_find_first_non_zero_coeff( + srcCoeff, + enableScalingLists, + &dep_quant_context, + scan, + q_coeff, + &firstTestPos, + width, + height); if (firstTestPos < 0) { return 0; } @@ -1044,7 +994,8 @@ int uvg_dep_quant( const uint32_t height_in_sbb = MAX(height >> 2, 1); const uint32_t width_in_sbb = MAX(width >> 2, 1); - + + const 
int default_quant_coeff = dep_quant_context.m_quant->m_QScale; //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { uint32_t blkpos = scan[scanIdx]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 357932f9..5ef1936e 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -1482,18 +1482,10 @@ void uvg_dep_quant_decide_and_update_avx2( } -void uvg_find_first_non_zero_avx2( - const coeff_t* srcCoeff, - const bool enableScalingLists, - context_store dep_quant_context, - const uint32_t* const scan, - const int32_t* q_coeff, - int* firstTestPos, - const int width, - const int height) +void uvg_find_first_non_zero_avx2(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, const int width, const int height) { - const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; - const int32_t thres = dep_quant_context.m_quant->m_thresLast; + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; int temp = *firstTestPos; if (enableScalingLists) { for (; temp >= 0; (temp)--) { diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c index f1103054..b15ef52b 100644 --- a/src/strategies/generic/depquant-generic.c +++ b/src/strategies/generic/depquant-generic.c @@ -227,10 +227,10 @@ static void uvg_dep_quant_decide_and_update_generic( } -void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, context_store dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height) +void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, 
const int32_t* q_coeff, int* firstTestPos, int width, int height) { - const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; - const int32_t thres = dep_quant_context.m_quant->m_thresLast; + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; int temp = *firstTestPos; for (; temp >= 0; (temp)--) { coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff)); diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h index 6a49dc35..5a58a3c7 100644 --- a/src/strategies/strategies-depquant.h +++ b/src/strategies/strategies-depquant.h @@ -61,15 +61,15 @@ typedef int(dep_quant_decide_and_update_func)( const uint32_t effHeight, bool is_chroma); -typedef void(find_first_non_zero_coeff_func)( - const coeff_t* srcCoeff, - const bool enableScalingLists, - context_store dep_quant_context, - const uint32_t* const scan, - const int32_t* q_coeff, - int* firstTestPos, - int width, - int height); +typedef void (find_first_non_zero_coeff_func)( + const coeff_t* srcCoeff, + const bool enableScalingLists, + const context_store* const dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + int width, + int height); // Declare function pointers. 
From 19829da152e62d0c996ddf550c2f3ef313d4d09e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 21 Jul 2023 14:23:37 +0300 Subject: [PATCH 241/254] Disable all avx2 optimizations that cannot be used with mtt/isp --- src/strategies/avx2/dct-avx2.c | 24 ++++++------- src/strategies/avx2/intra-avx2.c | 8 ++--- src/strategies/avx2/picture-avx2.c | 58 +++++++++++++----------------- src/strategies/avx2/quant-avx2.c | 2 +- 4 files changed, 42 insertions(+), 50 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 04e92a7f..bb8a92bc 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1656,22 +1656,22 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); - success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); - success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + //success &= 
uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); - success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); - success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); - success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); + //success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); + //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); } #endif // UVG_BIT_DEPTH == 8 diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 838bad91..30bbe7f2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1075,10 +1075,10 @@ int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8) { - success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); - success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); - success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, 
&uvg_intra_pred_filtered_dc_avx2); - success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); + //success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); + //success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); + //success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); + //success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); } #endif //UVG_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index 5d0b203c..f8be4987 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1749,35 +1749,27 @@ static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in __m128i diff = _mm_setzero_si128(); switch (width) { case 4: - diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[0]), diff); - diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[4]), diff); - diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[8]), diff); - diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[12]), diff); + for (int y = 0; y < height; y+=4) { + diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, 
pred_in + (y + 2) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff); + } break; case 8: - diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[0]), diff); - diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[8]), diff); - diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[16]), diff); - diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[24]), diff); - diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[32]), diff); - diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[40]), diff); - diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[48]), diff); - diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[56]), diff); + for (int y = 0; y < height; y += 2) { + diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y * 8]), diff); + diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff); + } break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]); _mm_storeu_si128((__m128i*) & residual[x + y * width], 
diff); @@ -1816,15 +1808,15 @@ int uvg_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_32x32_8bit_avx2); success &= uvg_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_64x64_8bit_avx2); - success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); - success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); - success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); + //success &= 
uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); success &= uvg_strategyselector_register(opaque, "bipred_average", "avx2", 40, &bipred_average_avx2); success &= uvg_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); success &= uvg_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 7729d272..bd857fa2 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -960,7 +960,7 @@ int uvg_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8) { - success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); + //success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); success &= uvg_strategyselector_register(opaque, "dequant", "avx2", 40, &uvg_dequant_avx2); } #endif // UVG_BIT_DEPTH == 8 From 4dccbcc30d8989effde34c6311e15c637313d43a Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 24 Jul 2023 15:32:53 +0300 Subject: [PATCH 242/254] [avx2] Forward transforms seem to be working --- src/strategies/avx2/dct-avx2.c | 6831 ++++++++++++++++++++++++- src/strategies/avx2/dct_avx2_tables.h | 4785 +++++++++++++++++ 2 files changed, 11441 insertions(+), 175 deletions(-) create mode 100644 src/strategies/avx2/dct_avx2_tables.h diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index bb8a92bc..f875a581 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -56,6 +56,11 @@ extern const int16_t uvg_g_dct_32_t[32][32]; #include "uvg266.h" #if UVG_BIT_DEPTH == 8 #include +#include "strategies/avx2/dct_avx2_tables.h" +#define MAX_LOG2_TR_DYNAMIC_RANGE 15 +#define TRANSFORM_MATRIX_SHIFT 6 +#define INVERSE_SHIFT_1ST 
(TRANSFORM_MATRIX_SHIFT + 1) +#define INVERSE_SHIFT_2ND (TRANSFORM_MATRIX_SHIFT + MAX_LOG2_TR_DYNAMIC_RANGE - 1 - UVG_BIT_DEPTH) /* * \file @@ -73,6 +78,583 @@ static INLINE __m256i truncate_avx2(__m256i v, __m256i debias, int32_t shift) return _mm256_srai_epi32(truncable, shift); } + +// TODO: find avx2 solution for transpose +// TODO: attempt to make a generic transpose for avx2. Needs some extra logic for different widths and heights. +// TODO: make a few solutions for exact sizes and see if some pattern emerges... +void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) { + const int sample_num = width * height; + const int vectors = sample_num / 16; + + int16_t* d_ptr = dst; + if (vectors == 0) { + return; + } + else if (vectors == 1) { + + } + else { + // Reserve enough storage for max transform size 32x32 + __m256i v_16b_result[64]; + __m256i v_32b_result[64]; + __m256i v_64b_result[64]; + __m256i v_128b_result[64]; + + // Handle two source vectors at a time + for (int i = 0; i < vectors; i += 2) { + __m256i v_src_0 = _mm256_load_si256((const __m256i*)src); + __m256i v_src_1 = _mm256_load_si256((const __m256i*)(src + 16)); + + v_16b_result[i] = _mm256_unpacklo_epi16(v_src_0, v_src_1); + v_16b_result[i + 1] = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + src += 32; + } + + // 32 bit shuffle pass + int loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 2; + + v_32b_result[i] = _mm256_unpacklo_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + v_32b_result[i + 1] = _mm256_unpackhi_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + loop_idx++; + } + + // 64 bit shuffle pass + loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 4; + + v_64b_result[i] = _mm256_unpacklo_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); + v_64b_result[i + 1] = _mm256_unpackhi_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); 
+ loop_idx++; + } + + // Final 128 bit shuffle pass + for (int i = 0; i < vectors; i += 2) { + const int idx_a = 0; + const int idx_b = 0; + + v_128b_result[i] = _mm256_unpacklo_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + v_128b_result[i + 1] = _mm256_unpackhi_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + } + + // Store loop + for (int i = 0; i < vectors; ++i) { + _mm256_store_si256((__m256i*)dst, v_128b_result[i]); + dst += 16; + } + } +} + +void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height) +{ + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dst[x * height + y] = src[y * width + x]; + } + } +} + + +typedef void (transpose_func)(const __m256i* src, __m256i* dst); + + +static void transpose_2x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x32_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void 
transpose_2x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x16_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + // const __m256i v_shuffle = _mm256_set_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + // 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + + __m256i v_src_tmp[4]; + v_src_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_src_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_src_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_src_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[2] = 
_mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_4x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp[8]; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_shuffle_epi8(src[i], v_shuffle); + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[i] = _mm256_shuffle_epi32(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp[6], v_tmp[7]); + + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp[6], v_tmp[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); +} +static void transpose_4x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x16_avx2(const __m256i* src, __m256i* 
dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + __m256i v_tmp128[8]; + + v_tmp128[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_tmp128[1] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_tmp128[2] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_tmp128[3] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_tmp128[4] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_tmp128[5] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + v_tmp128[6] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_tmp128[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(v_tmp128[6], v_tmp128[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(v_tmp128[6], v_tmp128[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + dst[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[1] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], 
v_tmp32_hi[1]); + dst[3] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[4] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[5] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[6] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + dst[7] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); +} +static void transpose_8x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + + const __m256i v_shuffle = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_tmp16_lo[i] = _mm256_unpacklo_epi16(src[offset], src[offset + 1]); + v_tmp16_hi[i] = _mm256_unpackhi_epi16(src[offset], src[offset + 1]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp32_lo[i + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_lo[i + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_lo[i + 2] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_lo[i + 3] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + + v_tmp32_hi[i + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_hi[i + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_hi[i + 2] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_hi[i + 3] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 2], 
v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + } + + for (int i = 0; i < 8; ++i) { + v_tmp64_lo[i] = _mm256_permute4x64_epi64(v_tmp64_lo[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp64_hi[i] = _mm256_permute4x64_epi64(v_tmp64_hi[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + dst[0] = _mm256_shuffle_epi8(v_tmp64_lo[0], v_shuffle); + dst[1] = _mm256_shuffle_epi8(v_tmp64_lo[4], v_shuffle); + dst[2] = _mm256_shuffle_epi8(v_tmp64_hi[0], v_shuffle); + dst[3] = _mm256_shuffle_epi8(v_tmp64_hi[4], v_shuffle); + + dst[4] = _mm256_shuffle_epi8(v_tmp64_lo[2], v_shuffle); + dst[5] = _mm256_shuffle_epi8(v_tmp64_lo[6], v_shuffle); + dst[6] = _mm256_shuffle_epi8(v_tmp64_hi[2], v_shuffle); + dst[7] = _mm256_shuffle_epi8(v_tmp64_hi[6], v_shuffle); + + dst[8] = _mm256_shuffle_epi8(v_tmp64_lo[1], v_shuffle); + dst[9] = _mm256_shuffle_epi8(v_tmp64_lo[5], v_shuffle); + dst[10] = _mm256_shuffle_epi8(v_tmp64_hi[1], v_shuffle); + dst[11] = _mm256_shuffle_epi8(v_tmp64_hi[5], v_shuffle); + + dst[12] = _mm256_shuffle_epi8(v_tmp64_lo[3], v_shuffle); + dst[13] = _mm256_shuffle_epi8(v_tmp64_lo[7], v_shuffle); + dst[14] = _mm256_shuffle_epi8(v_tmp64_hi[3], v_shuffle); + dst[15] = _mm256_shuffle_epi8(v_tmp64_hi[7], v_shuffle); +} +static void transpose_8x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + __m256i v_tmp32_lo[2]; + __m256i v_tmp32_hi[2]; + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + 
v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); +} +static void transpose_16x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[6], src[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[6], src[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_lo[1] = 
_mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); +} + +static void transpose_16x16_avx2_stride(const int16_t* src, int16_t* dst, const int src_stride, const int dst_stride) { + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 2) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + } + + __m256i v_tmp64_lo[8]; 
+ __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 8; d += 4, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + + v_tmp64_lo[d + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + v_tmp64_hi[d + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 4 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 5 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 6 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 7 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x20)); + + _mm256_storeu_si256((__m256i*)(dst + 8 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 9 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 10 * 
dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 11 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 12 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 13 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 14 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 15 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x31)); +} + +static void transpose_16x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 16); +} + +static void transpose_16x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 16, (int16_t*)dst + 16, 16, 32); + +} +static void transpose_16x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_32x2_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo0 = _mm256_unpacklo_epi16(src[0], src[2]); + __m256i v_tmp16_lo1 = _mm256_unpacklo_epi16(src[1], src[3]); + __m256i v_tmp16_hi0 = _mm256_unpackhi_epi16(src[0], src[2]); + __m256i v_tmp16_hi1 = _mm256_unpackhi_epi16(src[1], src[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x31); +} +static void transpose_32x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[2]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[1], src[3]); + 
v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[6]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[5], src[7]); + + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[2]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[1], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[6]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[5], src[7]); + + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x31); + + dst[4] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[5] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x20); + dst[6] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x31); +} +static void transpose_32x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_tmp16_lo[d + 0] = _mm256_unpacklo_epi16(src[s + 0], src[s + 2]); + v_tmp16_lo[d + 1] = _mm256_unpacklo_epi16(src[s + 1], src[s + 3]); + + v_tmp16_hi[d + 0] = _mm256_unpackhi_epi16(src[s + 0], src[s + 2]); + v_tmp16_hi[d + 1] = _mm256_unpackhi_epi16(src[s + 1], 
src[s + 3]); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_lo[d + 4] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_lo[d + 5] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_hi[d + 4] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_hi[d + 5] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_lo[d + 4] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 5] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 4] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 5] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[4], 
v_tmp64_hi[4], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x31); + + dst[8] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[9] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x20); + dst[10] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[11] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x20); + + dst[12] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[13] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x31); + dst[14] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); + dst[15] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x31); +} +static void transpose_32x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 16); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 16, 32, 16); +} +static void transpose_32x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 32, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32, (int16_t *)dst + 16, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32 + 16, (int16_t *)dst + 16 * 32 + 16, 32, 32); +} +static void transpose_32x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x32_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x64_avx2(const __m256i* src, __m256i* dst){} + + + +static transpose_func* 
transpose_func_table[6][6] = { + { transpose_2x2_avx2, transpose_4x2_avx2, transpose_8x2_avx2, transpose_16x2_avx2, transpose_32x2_avx2, transpose_64x2_avx2}, + { transpose_2x4_avx2, transpose_4x4_avx2, transpose_8x4_avx2, transpose_16x4_avx2, transpose_32x4_avx2, transpose_64x4_avx2}, + { transpose_2x8_avx2, transpose_4x8_avx2, transpose_8x8_avx2, transpose_16x8_avx2, transpose_32x8_avx2, transpose_64x8_avx2}, + {transpose_2x16_avx2, transpose_4x16_avx2, transpose_8x16_avx2, transpose_16x16_avx2, transpose_32x16_avx2, transpose_64x16_avx2}, + {transpose_2x32_avx2, transpose_4x32_avx2, transpose_8x32_avx2, transpose_16x32_avx2, transpose_32x32_avx2, transpose_64x32_avx2}, + {transpose_2x64_avx2, transpose_4x64_avx2, transpose_8x64_avx2, transpose_16x64_avx2, transpose_32x64_avx2, transpose_64x64_avx2}, +}; + + +// Dispatcher function for avx2 transposes. This calls the proper subfunction +void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height) +{ + // No need to transpose something of width or height 1 + const int w_log2_minus1 = uvg_g_convert_to_log2[width] - 1; + const int h_log2_minus1 = uvg_g_convert_to_log2[height] - 1; + + transpose_func* func = transpose_func_table[h_log2_minus1][w_log2_minus1]; + func(src, dst); +} + + // 4x4 matrix multiplication with value clipping. // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. 
@@ -945,12 +1527,6 @@ ITRANSFORM(dct, 32); /*****************************************************/ // DST-7 -#define DEFINE_DST7_P4_MATRIX(a,b,c,d) { \ - { a, b, c, d},\ - { c, c, 0, -c},\ - { d, -a, -c, b},\ - { b, -d, c, -a},\ -} #define DEFINE_DST7_P4_MATRIX_T(a,b,c,d) { \ { a, c, d, b},\ @@ -959,17 +1535,6 @@ ITRANSFORM(dct, 32); { d, -c, b, -a},\ } -#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{\ - { a, b, c, d, e, f, g, h},\ - { c, f, h, e, b, -a, -d, -g},\ - { e, g, b, -c, -h, -d, a, f},\ - { g, c, -d, -f, a, h, b, -e},\ - { h, -a, -g, b, f, -c, -e, d},\ - { f, -e, -a, g, -d, -b, h, -c},\ - { d, -h, e, -a, -c, g, -f, b},\ - { b, -d, f, -h, g, -e, c, -a},\ -} #define DEFINE_DST7_P8_MATRIX_T(a,b,c,d,e,f,g,h) \ {\ @@ -983,25 +1548,6 @@ ITRANSFORM(dct, 32); { h, -g, f, -e, d, -c, b, -a,},\ }\ -#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o}, \ - { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n}, \ - { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m}, \ - { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l}, \ - { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k}, \ - { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j}, \ - { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i}, \ - { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h}, \ - { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g}, \ - { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f}, \ - { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e}, \ - { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d}, \ - { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c}, \ - { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b}, \ - { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a}, \ -} #define DEFINE_DST7_P16_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ { \ @@ -1024,43 +1570,6 @@ ITRANSFORM(dct, 32); } - 
-#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E}, \ - {e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D}, \ - {g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C}, \ - {i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B}, \ - {k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A}, \ - {m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z}, \ - {o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y}, \ - {q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x}, \ - {s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w}, \ - {u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v}, \ - {w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u}, \ - {y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t}, \ - {A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s}, \ - {C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r}, \ - {E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q}, \ - {F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, 
-y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p}, \ - {D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o}, \ - {B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n}, \ - {z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m}, \ - {x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l}, \ - {v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k}, \ - {t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j}, \ - {r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i}, \ - {p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h}, \ - {n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g}, \ - {l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f}, \ - {j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e}, \ - {h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d}, \ - {f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c}, \ - {d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b}, \ - {b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a}, \ -} - #define DEFINE_DST7_P32_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ { \ {a, c, e, g, i, k, m, o, q, s, u, w, y, A, C, E, F, 
D, B, z, x, v, t, r, p, n, l, j, h, f, d, b,},\ @@ -1097,85 +1606,6 @@ ITRANSFORM(dct, 32); {F, -E, D, -C, B, -A, z, -y, x, -w, v, -u, t, -s, r, -q, p, -o, n, -m, l, -k, j, -i, h, -g, f, -e, d, -c, b, -a,},\ } -// DCT-8 -#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ -{ \ - {a, b, c, d}, \ - {b, 0, -b, -b}, \ - {c, -b, -d, a}, \ - {d, -b, a, -c}, \ -} - -#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{ \ - {a, b, c, d, e, f, g, h}, \ - {b, e, h, -g, -d, -a, -c, -f}, \ - {c, h, -e, -a, -f, g, b, d}, \ - {d, -g, -a, -h, c, e, -f, -b}, \ - {e, -d, -f, c, g, -b, -h, a}, \ - {f, -a, g, e, -b, h, d, -c}, \ - {g, -c, b, -f, -h, d, -a, e}, \ - {h, -f, d, -b, a, -c, e, -g}, \ -} - -#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - {b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n}, \ - {c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l}, \ - {d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j}, \ - {e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h}, \ - {f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f}, \ - {g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d}, \ - {h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b}, \ - {i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a}, \ - {j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c}, \ - {k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e}, \ - {l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g}, \ - {m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i}, \ - {n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k}, \ - {o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m}, \ - {p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o}, \ -} - - -#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, 
-v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D}, \ - {c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B}, \ - {d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z}, \ - {e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x}, \ - {f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v}, \ - {g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t}, \ - {h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r}, \ - {i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p}, \ - {j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n}, \ - {k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l}, \ - {l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j}, \ - {m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h}, \ - {n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f}, \ - {o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d}, \ - {p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b}, \ - {q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a}, \ - {r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c}, \ - {s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, 
-h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e}, \ - {t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g}, \ - {u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i}, \ - {v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k}, \ - {w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m}, \ - {x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o}, \ - {y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q}, \ - {z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s}, \ - {A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u}, \ - {B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w}, \ - {C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y}, \ - {D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A}, \ - {E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C}, \ - {F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E}, \ -} - - // DST-7 ALIGNED(64) const int16_t uvg_g_dst7_4[4][4] = DEFINE_DST7_P4_MATRIX(29, 55, 74, 84); ALIGNED(64) const int16_t uvg_g_dst7_8[8][8] = DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86); @@ -1576,6 +2006,6044 @@ static tr_func* idct_table[5] = { mts_idct_4x4_avx2, mts_idct_8x8_avx2, mts_idct_16x16_avx2, mts_idct_32x32_avx2, NULL/*fastInverseDCT2_B64*/ }; +typedef void 
(dct_full_pass)(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver); + + +// ********************************************** +// New tailored functions for each size combination +// ********************************************** + +static void fast_forward_tr_2xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*)coeff); + __m256i v_coeff_1 = _mm256_load_si256((__m256i*)(coeff + 16)); + __m256i* v_dst_ptr = dst; + + const int reduced_line = line - skip_line; + // Handle 8 lines at a time (16 samples, 2 samples per line) + for (int j = 0; j < reduced_line; j += 8) { + // src vector: [00 01 02 03 04 05 06 07|08 09 10 11 12 13 14 15] + __m256i v_src = _mm256_load_si256((const __m256i*) src); + + // Multiply with a and add together all adjacent elements + // even vector: [a00+a01 a02+a03 a04+a05 a06+a07|a08+a09 a10+a11 a12+a13 a14+a15] + __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0); + // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15] + __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift); + + v_dst_ptr[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + v_dst_ptr++; + } +} + +void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff 
= ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_2x8_coeff_ver; + if (ver == DST7) { + ver_coeff = ff_dst7_2x8_coeff_ver; + } + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_hor_pass_out; + fast_forward_tr_2xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got data for only 1 vector + // const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x8_shuffle_ver); + const __m256i v_src_raw = v_hor_pass_out; + // __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + v_madd[i] = _mm256_madd_epi16(v_src, v_coeff[i]); + } + __m256i v_hadd_0[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_hadd_0[i] = _mm256_hadd_epi32(v_madd[offset], v_madd[offset + 1]); + } + + __m256i v_trunc[2]; + for (int i = 0; i < 2; ++i) { + const int offset = i * 2; + v_trunc[i] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[offset], v_hadd_0[offset + 1]), debias, shift_2nd); + } + + __m256i v_result = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + const __m256i v_res_shfl = _mm256_load_si256((const __m256i*)ff_dct2_2x8_result_shuffle_ver); + // Shuffle values to correct order + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + v_result = _mm256_shuffle_epi32(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + v_result = _mm256_shuffle_epi8(v_result, v_res_shfl); + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_2x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + 
const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_hor); + + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(v_src, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src, v_coeff[7]); + + __m256i v_hadd_00 = _mm256_hadd_epi32(v_madd_0, v_madd_1); + __m256i v_hadd_01 = _mm256_hadd_epi32(v_madd_2, v_madd_3); + __m256i v_hadd_02 = _mm256_hadd_epi32(v_madd_4, v_madd_5); + __m256i v_hadd_03 = _mm256_hadd_epi32(v_madd_6, v_madd_7); + + __m256i v_hadd_10 = _mm256_hadd_epi32(v_hadd_00, v_hadd_01); + __m256i v_hadd_11 = _mm256_hadd_epi32(v_hadd_02, v_hadd_03); + + __m256i v_trunc_0 = truncate_avx2(v_hadd_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_hadd_11, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_2x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_ver); + const __m256i v_res_shuffle = 
_mm256_load_si256((const __m256i*)fi_tr_8x2_res_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_2x8_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x8_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = uvg_g_dct_16; + if (ver == DST7) { + ver_coeff = uvg_g_dst7_16; + } + const __m256i v_res_shuffle = _mm256_load_si256((const 
__m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_hor_pass_out[2]; + fast_forward_tr_2xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Permute hor pass output to correct order + __m256i v_tmp_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_tmp_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_src_0 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x31); + + const __m256i* v_coeff_ptr = (const __m256i*)ver_coeff; + + __m256i v_madd[2][16]; + for (int i = 0; i < 16; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_src_0, v_coeff_ptr[i]); + v_madd[1][i] = _mm256_madd_epi16(v_src_1, v_coeff_ptr[i]); + } + + __m256i v_hadd_0[2][8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[0][dst] = _mm256_hadd_epi32(v_madd[0][src], v_madd[0][src + 1]); + v_hadd_0[1][dst] = _mm256_hadd_epi32(v_madd[1][src], v_madd[1][src + 1]); + } + + __m256i v_hadd_1[2][4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd_1[0][dst] = _mm256_hadd_epi32(v_hadd_0[0][src], v_hadd_0[0][src + 1]); + v_hadd_1[1][dst] = _mm256_hadd_epi32(v_hadd_0[1][src], v_hadd_0[1][src + 1]); + } + + __m256i v_tmp_00 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x20); + __m256i v_tmp_01 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x31); + __m256i v_tmp_02 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x20); + __m256i v_tmp_03 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x31); + + __m256i v_tmp_10 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x20); + 
__m256i v_tmp_11 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x31); + __m256i v_tmp_12 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x20); + __m256i v_tmp_13 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x31); + + __m256i v_trunc_00 = truncate_avx2((_mm256_add_epi32(v_tmp_00, v_tmp_01)), debias, shift_2nd); + __m256i v_trunc_01 = truncate_avx2((_mm256_add_epi32(v_tmp_02, v_tmp_03)), debias, shift_2nd); + + __m256i v_trunc_10 = truncate_avx2((_mm256_add_epi32(v_tmp_10, v_tmp_11)), debias, shift_2nd); + __m256i v_trunc_11 = truncate_avx2((_mm256_add_epi32(v_tmp_12, v_tmp_13)), debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_00, v_trunc_10); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_01, v_trunc_11); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + _mm256_store_si256((__m256i*)&dst[0], v_result_0); + _mm256_store_si256((__m256i*)&dst[16], v_result_1); +} + + +static void fast_inverse_tr_2x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + for (int c = 0; c < 16; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[16]; + for (int i = 0; i < 
16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); +} + +static void fast_inverse_tr_2x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_lo = _mm256_unpacklo_epi16(src[0], src[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(src[0], src[1]); + + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_lo_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_lo_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_madd_hi_0, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_madd_hi_1, debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + 
+ __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_2x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = uvg_g_dct_32; + // For result shuffling, can use existing shuffle vector + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + ALIGNED(32) int16_t v_hor_pass_out[2*32]; + fast_forward_tr_2xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[4]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 2; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ff_dct2_32x32_coeff_ver; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + + __m256i v_src = _mm256_set1_epi32(*temp_source); + temp_source += i & 1 ? 
3 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 2); +} + + +static void fast_inverse_tr_2x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { 
+ v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + const __m256i v_coeff_0 = _mm256_setr_epi64x(c_ptr[0], c_ptr[1], c_ptr[0], c_ptr[1]); + const __m256i v_coeff_1 = _mm256_setr_epi64x(c_ptr[2], c_ptr[3], c_ptr[2], c_ptr[3]); + const __m256i v_coeff_2 = _mm256_setr_epi64x(c_ptr[4], c_ptr[5], c_ptr[4], c_ptr[5]); + const __m256i v_coeff_3 = _mm256_setr_epi64x(c_ptr[6], c_ptr[7], c_ptr[6], c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[c] = _mm256_add_epi32(v_add_00, v_add_01); + c_ptr += 8; + } + + __m256i v_hadd_0[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_2x32_avx2_hor(const __m256i* src, int16_t* dst, 
const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i v_src_lo0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_lo1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + const __m256i v_src_hi0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_hi1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[1]), debias, shift); + __m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[1]), debias, shift); + + __m256i v_result[4]; + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01), v_res_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_10, v_trunc_lo_11), v_res_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01), v_res_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11), v_res_shuffle); + + v_result[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x20); + v_result[1] = 
_mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; // rename + const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_2x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + +} + + +void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + const int reduced_line = line - skip_line; + // Handle 4 lines at a time (16 samples, 4 samples per line) + for (int j = 0; j < reduced_line; j += 4) { + // line 0 line 1 line 2 line 3 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*) src); + + // Arrange data for column-wise calculation. Data and coeffs are ordered so no further shuffling + // or permutes are needed. 
+ // vec 1 : [s00 s01 s04 s05 s08 s09 s12 s13 | s00 s01 s04 s05 s08 s09 s12 s13] + // vec 2 : [s02 s03 s06 s07 s10 s11 s14 s15 | s02 s03 s06 s07 s10 s11 s14 s15] + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + // TODO: coeffs for DST7 and DCT8 + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = fast_forward_dct2_b4_coeff; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } + else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = fast_forward_dst7_b4_coeff; + } + else if (ver == DCT8) { + ver_coeff = fast_forward_dct8_b4_coeff; + } + + __m256i v_hor_pass_out; + fast_forward_tr_4xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & ver_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & ver_coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & ver_coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & ver_coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_4x4_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_shuffle_hor); + + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i 
v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_4x4_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_result_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_4xN_coeff_hor; + const int16_t* ver_coeff = 
fi_dct2_4xN_coeff_hor; // Can use same table for both passes + if (hor == DST7) { + hor_coeff = fi_dst7_4xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4xN_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +void fast_forward_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x8_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + __m256i v_madd[2][8]; + for (int i = 0; i < 8; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff[0]); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + } + + __m256i v_trunc[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + // Order results + v_result[0] = _mm256_permute4x64_epi64(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + v_result[0] = _mm256_shuffle_epi32(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_shuffle_epi32(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)&dst[0], v_result[0]); + _mm256_store_si256((__m256i*)&dst[16], v_result[1]); +} + + +static void fast_inverse_tr_4x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + const __m256i v_permute = _mm256_load_si256((const __m256i*)permute_32b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + v_src_0 = _mm256_permutevar8x32_epi32(v_src_0, v_permute); + v_src_1 = 
_mm256_permutevar8x32_epi32(v_src_1, v_permute); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_madd_04 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_14 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + + __m256i v_madd_05 = _mm256_madd_epi16(v_src_0, v_coeff[10]); + __m256i v_madd_15 = _mm256_madd_epi16(v_src_1, v_coeff[11]); + + __m256i v_madd_06 = _mm256_madd_epi16(v_src_0, v_coeff[12]); + __m256i v_madd_16 = _mm256_madd_epi16(v_src_1, v_coeff[13]); + + __m256i v_madd_07 = _mm256_madd_epi16(v_src_0, v_coeff[14]); + __m256i v_madd_17 = _mm256_madd_epi16(v_src_1, v_coeff[15]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + __m256i v_add_4 = _mm256_add_epi32(v_madd_04, v_madd_14); + __m256i v_add_5 = _mm256_add_epi32(v_madd_05, v_madd_15); + __m256i v_add_6 = _mm256_add_epi32(v_madd_06, v_madd_16); + __m256i v_add_7 = _mm256_add_epi32(v_madd_07, v_madd_17); + + __m256i v_hadd_0 = _mm256_hadd_epi32(v_add_0, v_add_1); + __m256i v_hadd_1 = _mm256_hadd_epi32(v_add_2, v_add_3); + __m256i v_hadd_2 = _mm256_hadd_epi32(v_add_4, v_add_5); + __m256i v_hadd_3 = _mm256_hadd_epi32(v_add_6, v_add_7); + + __m256i v_trunc_0 = truncate_avx2(v_hadd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_hadd_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_hadd_2, debias, shift); + __m256i v_trunc_3 = 
truncate_avx2(v_hadd_3, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_4x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + v_result_0 = 
_mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + v_result_0 = _mm256_shuffle_epi32(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_shuffle_epi32(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x4_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_4x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = uvg_g_dct_16; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = 
fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_16; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_16; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const int64_t* coeff_ptr = (const int64_t*)ver_coeff; // Read four coeffs at once by casting into 64 bit integer + + __m256i v_madd[4][16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff_0 = _mm256_set1_epi64x(coeff_ptr[0]); + const __m256i v_coeff_1 = _mm256_set1_epi64x(coeff_ptr[1]); + const __m256i v_coeff_2 = _mm256_set1_epi64x(coeff_ptr[2]); + const __m256i v_coeff_3 = _mm256_set1_epi64x(coeff_ptr[3]); + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff_0); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff_1); + v_madd[2][i] = _mm256_madd_epi16(v_hor_pass_out[2], v_coeff_2); + v_madd[3][i] = _mm256_madd_epi16(v_hor_pass_out[3], v_coeff_3); + coeff_ptr += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_tmp0 = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + __m256i v_tmp1 = _mm256_add_epi32(v_madd[2][i], v_madd[3][i]); + + v_add[i] = _mm256_add_epi32(v_tmp0, v_tmp1); + } + + __m256i v_trunc[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + v_result[i] = 
_mm256_shuffle_epi32(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_4x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + __m256i v_src_2 = _mm256_shuffle_epi8(v_src_raw[2], v_shuffle); + __m256i v_src_3 = _mm256_shuffle_epi8(v_src_raw[3], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_permute4x64_epi64(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_permute4x64_epi64(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + v_src_0 = _mm256_shuffle_epi32(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_shuffle_epi32(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_shuffle_epi32(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_shuffle_epi32(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int c = 0; c < 16; c++) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[3]); + v_coeff += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, 
v_add_1); + } + + __m256i v_hadd[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_4x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[2], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[2], src[3], 0x31); + + __m256i v_madd_0[4]; + __m256i v_madd_1[4]; + __m256i v_madd_2[4]; + __m256i v_madd_3[4]; + for (int c = 0; c < 4; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_trunc_0[4]; + __m256i v_trunc_1[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_0[i] = truncate_avx2(_mm256_add_epi32(v_madd_0[i], v_madd_1[i]), debias, shift); + v_trunc_1[i] = truncate_avx2(_mm256_add_epi32(v_madd_2[i], v_madd_3[i]), debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + __m256i v_tmp2 = 
_mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i v_tmp32_0 = _mm256_unpacklo_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_1 = _mm256_unpackhi_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_2 = _mm256_unpacklo_epi32(v_tmp2, v_tmp3); + __m256i v_tmp32_3 = _mm256_unpackhi_epi32(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x4_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_4x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t 
ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x32_coeff_ver; + } + + int16_t v_hor_pass_out[4*32]; + fast_forward_tr_4xN_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + + __m256i temp_out[8]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 4; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + + __m256i v_src = _mm256_set1_epi32(*temp_source); + temp_source += i & 1 ? 
7 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 4); + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_4x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = 
(const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[i] = _mm256_shuffle_epi32(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + const __m256i v_coeff = _mm256_set1_epi64x(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_add[c] = _mm256_add_epi32(v_add_10, v_add_11); + } + + __m256i v_hadd[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_4x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + + __m256i v_src[8]; + __m256i v_tmp[8]; + v_src[0] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_src_raw[0], 
v_src_raw[1], 0x31); + v_src[2] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x31); + v_src[4] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x20); + v_src[7] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x31); + + for (int d = 0, c = 0; c < 4; ++c, d += 2) { + __m256i v_madd_00 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src[2], v_coeff[0]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src[3], v_coeff[1]); + __m256i v_madd_20 = _mm256_madd_epi16(v_src[4], v_coeff[0]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src[5], v_coeff[1]); + __m256i v_madd_30 = _mm256_madd_epi16(v_src[6], v_coeff[0]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src[7], v_coeff[1]); + v_coeff += 2; + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_01), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_10, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_20, v_madd_21), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_30, v_madd_31), debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp[d + 0] = _mm256_permute4x64_epi64(v_tmp[d + 0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[d + 1] = _mm256_permute4x64_epi64(v_tmp[d + 1], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[8]; + transpose_avx2(v_tmp, v_result, 32, 4); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const 
int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_4x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int j = 0; j < reduced_line; j += 2) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange source in a way samples can be added together column-wise using add + // after first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...] 
+ + __m256i v_src_0 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x2_coeff_ver; + // Only DCT2 is 
defined for 8x2 block + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_forward_tr_8xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // TODO: coeffs for DST7 and DCT8 transforms + const __m256i* v_coeff = (const __m256i*)ver_coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x2_ver_pass_shuffle); + + // 8x2, only 16 samples, handle all at once + __m256i v_src_per = _mm256_permute4x64_epi64(v_hor_pass_out, _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + // v_src = _mm256_unpackhi_epi16(v_src_raw, v_src_swp); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); // TODO: this permute can probably be optimized away + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_2x8_shuffle_hor); + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + // Got data for one 
vector + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0); + // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15] + __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver); + const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver); + + __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift); + __m256i v_trunc_1 = 
truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_shuffle1); + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + v_result = _mm256_shuffle_epi8(v_result, v_shuffle2); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename + // Only dct2 transform is defined for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x4_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, 
shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // 32 samples, process in two steps + __m256i v_src_per_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_src_per_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_per_0, v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_per_1, v_shuffle); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(v_add_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(v_add_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_result_1 = 
_mm256_packs_epi32(v_trunc_2, v_trunc_3); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + // Swap each middle 16 bit value in each 64 bit chunk + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +/* 8x4 inverse transform, vertical (first) pass: loads the 32 input coefficients, + * interleaves them so each madd pairs two rows, multiplies against the vertical + * coefficient table and truncates; packed 16-bit results are left in registers via dst. + * NOTE(review): line/skip_line/skip_line2 appear unused in this body -- confirm intended. */ +static void fast_inverse_tr_8x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i v_src_raw_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_raw_1 = _mm256_load_si256((const __m256i*) & src[16]); + + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw_0, v_src_raw_1); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw_0, v_src_raw_1); + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[1]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[3]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[4]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +/* 8x4 inverse transform, horizontal (second) pass: consumes the register output of the + * vertical pass, multiplies against the horizontal coefficient table, reshuffles the + * lanes back into row order and stores the final 8x4 block to dst. + * NOTE(review): the loop counters named dst/src in the hadd loop shadow the function + * parameters of the same names -- legal but worth renaming for readability. */ +static void fast_inverse_tr_8x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x31); + + v_result[0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result[0]); + _mm256_store_si256((__m256i*) & dst[16], v_result[1]); +} + +/* Inverse 8x4 transform driver: selects DCT2/DST7/DCT8 coefficient tables (currently + * reusing the transposed 4x8 tables, see TODO on renaming), runs the vertical pass into + * registers, then the horizontal pass which stores the result into dst. */ +void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x8_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_8x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +/* Forward 8x8 transform: shared horizontal pass (fast_forward_tr_8xN_avx2_hor) into + * registers, followed by an inlined vertical pass below that multiplies column pairs + * against the vertical coefficient table and stores the packed result to dst. */ +void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x8_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical 
pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + const int32_t* coeff_ptr = (const int32_t*)ver_coeff; // Cast into 32 bit integer to read two coeffs at a time + + /* Interleave the four horizontal-pass output vectors so each 32-bit lane holds a + * vertically adjacent sample pair, matching the paired-coefficient layout read via coeff_ptr. */ + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_trunc[8]; + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x31); + + /* One output row per iteration: madd the four source pair-vectors against broadcast + * coefficient pairs, reduce, then truncate with the second-stage shift. */ + for (int i = 0; i < 8; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi32(coeff_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi32(coeff_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi32(coeff_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi32(coeff_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_trunc[i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift_2nd); + coeff_ptr += 4; + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +/* 8x8 inverse transform, horizontal (first) pass: reorders the 64 input coefficients + * with shuffle_16b_0415, multiplies against the horizontal coefficient table and leaves + * packed 16-bit results in registers via dst for the vertical pass. */ +static void fast_inverse_tr_8x8_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[4]; + v_src[0] = _mm256_permute4x64_epi64(v_src_raw[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = _mm256_permute4x64_epi64(v_src_raw[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_permute4x64_epi64(v_src_raw[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_permute4x64_epi64(v_src_raw[3], _MM_SHUFFLE(3, 1, 2, 0)); + + v_src[0] = _mm256_shuffle_epi8(v_src[0], v_shuffle); + v_src[1] = _mm256_shuffle_epi8(v_src[1], v_shuffle); + v_src[2] = _mm256_shuffle_epi8(v_src[2], v_shuffle); + v_src[3] = _mm256_shuffle_epi8(v_src[3], v_shuffle); + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + 
dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +/* 8x8 inverse transform, vertical (second) pass: transposes the register output of the + * horizontal pass into column vectors, multiplies against the vertical coefficient + * table, then re-transposes and stores the final 8x8 block to dst. */ +static void fast_inverse_tr_8x8_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + /* Gather the input into column order: 32-bit shuffle, cross-lane permutes, then + * 64-bit unpacks complete the transpose. */ + __m256i v_src[4]; + v_src[0] = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_shuffle_epi32(src[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_shuffle_epi32(src[3], _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + __m256i v_tmp2 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + __m256i v_tmp3 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + + v_src[0] = _mm256_unpacklo_epi64(v_tmp0, v_tmp2); + v_src[1] = _mm256_unpackhi_epi64(v_tmp0, v_tmp2); + v_src[2] = _mm256_unpacklo_epi64(v_tmp1, v_tmp3); + v_src[3] = _mm256_unpackhi_epi64(v_tmp1, v_tmp3); + + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + v_result[2] = _mm256_shuffle_epi8(v_result[2], v_res_shuffle); + v_result[3] = _mm256_shuffle_epi8(v_result[3], v_res_shuffle); + + __m256i v_rtmp0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp1 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp2 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp3 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + + __m256i v_tmp20 = _mm256_unpacklo_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp21 = _mm256_unpackhi_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp22 = _mm256_unpacklo_epi64(v_rtmp1, v_rtmp3); + __m256i v_tmp23 = _mm256_unpackhi_epi64(v_rtmp1, v_rtmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +/* Inverse 8x8 transform driver. Note: both passes use the *_8x8_coeff_hor tables -- + * valid for a square block where hor and ver tables coincide. */ +void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_8x8_coeff_hor; + const int16_t* ver_coeff = fi_dct2_8x8_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_8x8_coeff_hor; + } else 
if (hor == DCT8) { + hor_coeff = fi_dct8_8x8_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x8_coeff_hor; + } + + __m256i v_hor_pass_out[4]; + fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +/* Forward 8x16 transform: shared 8-wide horizontal pass, then an inlined vertical pass + * that broadcasts coefficient pairs from the 16-point vertical table. */ +void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x16_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Can use same shuffles as 8x4 + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + //const __m256i* v_coeff = (const __m256i*)ver_coeff; + const int32_t *line_coeff = (const int32_t*)ver_coeff; + + // Multiply+add all source vectors with coeff vectors + __m256i v_madd[8][16]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 8; ++i) { + __m256i v_src_per = _mm256_permute4x64_epi64(v_src_ptr[0], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + + for (int ii = 0; ii < 16; ++ii) { + //int coeff_row = ii * 8 + i; + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd[i][ii] = _mm256_madd_epi16(v_src, v_coeff); + } + line_coeff += 16; + v_src_ptr += 1; + } + + // Add vectors + __m256i v_add_0[4][16]; + for (int i = 0; i < 4; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_0[i][ii] = _mm256_add_epi32(v_madd[offset][ii], v_madd[offset + 1][ii]); + } + } + // Second round of additions + __m256i v_add_1[2][16]; + for (int i = 0; i < 2; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_1[i][ii] = _mm256_add_epi32(v_add_0[offset][ii], v_add_0[offset + 1][ii]); + } + } + // Third round of additions + __m256i v_trunc[16]; + for (int ii = 0; ii < 16; ++ii) { + v_trunc[ii] = _mm256_add_epi32(v_add_1[0][ii], v_add_1[1][ii]); + v_trunc[ii] = truncate_avx2(v_trunc[ii], debias, shift_2nd); + } + + + for (int i = 0; i < 16; i += 2) { + __m256i v_result = _mm256_packs_epi32(v_trunc[i], v_trunc[i + 1]); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + // Swap each middle 16 bit value in each 64 bit chunk + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); + dst += 16; + } +} + + +/* 8x16 inverse transform, vertical (first) pass: reorders the input with shuffle_16b_0415, + * accumulates 8 madd products per output row across the 16-point coefficient table, and + * leaves packed 16-bit results in registers via dst. */ +static void fast_inverse_tr_8x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_src_raw[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_tmp[i], v_shuffle); + } + + __m256i v_trunc[16]; + for (int c = 0; c < 16; c++) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff[7]); + + v_coeff += 8; + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + 
dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } +} + +/* 8x16 inverse transform, horizontal (second) pass: transposes the vertical-pass output + * into column vectors, multiplies against the 8-point horizontal table in two 4-row + * batches, then rebuilds row order with 32/64-bit unpacks and stores the block to dst. */ +static void fast_inverse_tr_8x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi32(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp[8]; + v_tmp[0] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + v_tmp[2] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x20); + v_tmp[4] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + v_tmp[5] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + v_tmp[6] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x31); + v_tmp[7] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x31); + + v_src[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_src[1] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_src[2] = _mm256_unpacklo_epi32(v_tmp[4], v_tmp[5]); + v_src[3] = _mm256_unpackhi_epi32(v_tmp[4], v_tmp[5]); + v_src[4] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_src[5] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + v_src[6] = _mm256_unpacklo_epi32(v_tmp[6], v_tmp[7]); + v_src[7] = _mm256_unpackhi_epi32(v_tmp[6], v_tmp[7]); + + __m256i v_trunc[2][8]; + for (int d = 0, s = 0; d < 2; ++d, s += 4) { + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int c = 0; c < 8; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src[s + 0], v_c_ptr[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src[s + 1], v_c_ptr[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src[s + 2], v_c_ptr[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src[s + 3], v_c_ptr[3]); + v_c_ptr += 4; + } + + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_trunc[d][i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift); + } + } + + __m256i v_rtmp[8]; + v_rtmp[0] = _mm256_packs_epi32(v_trunc[0][0], v_trunc[0][1]); + v_rtmp[1] = _mm256_packs_epi32(v_trunc[0][2], v_trunc[0][3]); + v_rtmp[2] = _mm256_packs_epi32(v_trunc[0][4], v_trunc[0][5]); + v_rtmp[3] = _mm256_packs_epi32(v_trunc[0][6], v_trunc[0][7]); + v_rtmp[4] = _mm256_packs_epi32(v_trunc[1][0], v_trunc[1][1]); + v_rtmp[5] = _mm256_packs_epi32(v_trunc[1][2], v_trunc[1][3]); + v_rtmp[6] = _mm256_packs_epi32(v_trunc[1][4], v_trunc[1][5]); + v_rtmp[7] = _mm256_packs_epi32(v_trunc[1][6], v_trunc[1][7]); + + for (int i = 0; i < 8; ++i) { + v_rtmp[i] = _mm256_shuffle_epi8(v_rtmp[i], v_res_shuffle); + } + + __m256i v_tmp32_lo0 = _mm256_unpacklo_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_lo1 = _mm256_unpacklo_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_lo2 = _mm256_unpacklo_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_lo3 = _mm256_unpacklo_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp32_hi0 = _mm256_unpackhi_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_hi1 = _mm256_unpackhi_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_hi2 = _mm256_unpackhi_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_hi3 = _mm256_unpackhi_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp64_lo0 = _mm256_unpacklo_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_lo1 = _mm256_unpacklo_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_lo2 = _mm256_unpacklo_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_lo3 = _mm256_unpacklo_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_tmp64_hi0 = _mm256_unpackhi_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_hi1 = _mm256_unpackhi_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_hi2 = _mm256_unpackhi_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_hi3 = _mm256_unpackhi_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_result[8]; + v_result[0] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x31); + v_result[4] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x20); + v_result[7] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +/* Inverse 8x16 transform driver: vertical pass first (into registers), then horizontal + * pass. Coefficient tables are borrowed from the 16x8 case, see TODO on renaming. */ +void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x8_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_8x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +/* Forward 8x32 transform. skip_height implements the MTS high-frequency zero-out: for + * non-DCT2 vertical transforms of 32-point length only the first 16 output rows are kept + * and the rest are zeroed after the passes. */ +void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height 
== 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[8 * 32]; + fast_forward_tr_8xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[16]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + /* One column j per iteration: pair vertically adjacent samples via memcpy into a + * 32-bit scalar (avoids aliasing issues), broadcast, and accumulate against four + * coefficient vectors covering the 32 output rows. */ + for (int j = 0; j < 8; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 16]; + source[1] = v_hor_pass_out[j + i * 16 + 8]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 8); +/* NOTE(review): these #undefs have no matching #define in this function -- likely + * left over from a copied template; confirm they can be removed. */ +#undef NUM_PARTS +#undef PART_DIMENSION + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +/* 8x32 inverse transform, vertical (first) pass: transposes the input into 16 pair-wise + * interleaved source vectors, then accumulates one madd chain per output row (32 rows), + * leaving packed results in registers via dst. */ +static void fast_inverse_tr_8x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_tmp[16]; + for (int i = 0; i < 16; i += 2) { + v_tmp[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20); + v_tmp[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31); + } + + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_tmp[s + 0], v_tmp[s + 1]); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_src[16]; + for (int d = 0, s = 0; d < 16; d += 2, ++s) { + v_src[d + 0] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x20); + v_src[d + 1] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x31); + } + + __m256i v_trunc[32]; + + for (int row = 0; row < 32; ++row) { + __m256i v_res = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd = _mm256_madd_epi16(v_src[i], v_coeff); + v_res = _mm256_add_epi32(v_res, v_madd); + c_ptr++; + } + + v_trunc[row] = truncate_avx2(v_res, debias, shift); + } + + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + dst[d] = 
_mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } +} + +/* 8x32 inverse transform, horizontal (second) pass: consumes the vertical-pass registers, + * multiplies against the 8-point horizontal table, horizontally reduces with hadd, and + * stores the final 8x32 block to dst in row order. */ +static void fast_inverse_tr_8x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src[16]; + for (int i = 0; i < 16; i += 2) { + v_src[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20); + v_src[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31); + } + + __m256i v_tmp[16]; + for (int s = 0; s < 16; s += 2) { + __m256i v_add[8]; + for (int d = 0, c = 0; d < 8; ++d, c += 2) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[s + 0], v_coeff[c + 0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[s + 1], v_coeff[c + 1]); + + v_add[d] = _mm256_add_epi32(v_madd_0, v_madd_1); + } + + __m256i v_hadd[4]; + v_hadd[0] = _mm256_hadd_epi32(v_add[0], v_add[1]); + v_hadd[1] = _mm256_hadd_epi32(v_add[2], v_add[3]); + v_hadd[2] = _mm256_hadd_epi32(v_add[4], v_add[5]); + v_hadd[3] = _mm256_hadd_epi32(v_add[6], v_add[7]); + + __m256i v_trunc[4]; + v_trunc[0] = truncate_avx2(v_hadd[0], debias, shift); + v_trunc[1] = truncate_avx2(v_hadd[1], debias, shift); + v_trunc[2] = truncate_avx2(v_hadd[2], debias, shift); + v_trunc[3] = truncate_avx2(v_hadd[3], debias, shift); + + v_tmp[s + 0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_tmp[s + 1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + } + + for (int i = 0; i < 16; ++i) { + v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp64_lo[d] = _mm256_unpacklo_epi64(v_tmp[s + 0], v_tmp[s + 1]); + v_tmp64_hi[d] = _mm256_unpackhi_epi64(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_result[16]; + for (int d = 0, s = 0; d < 16; d += 2, ++s) { + v_result[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x20); + v_result[d + 1] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x31); + } + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: mts cutoff +} + +/* Inverse 8x32 transform driver: the 32-point vertical pass uses the raw transposed + * 32-point basis tables (uvg_g_*_32*); skip_height mirrors the forward MTS zero-out + * but is currently only forwarded to the vertical pass. */ +void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table + if (hor == DST7) { + hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[16]; + fast_inverse_tr_8x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // ISP_TODO: might be faster to load these from arrays + const __m256i v_permute_0 = _mm256_set1_epi32(0); + const __m256i v_permute_1 = _mm256_set1_epi32(1); + const __m256i v_permute_2 = _mm256_set1_epi32(2); + const __m256i v_permute_3 = _mm256_set1_epi32(3); + const __m256i v_permute_4 = _mm256_set1_epi32(4); + const __m256i v_permute_5 = _mm256_set1_epi32(5); + const __m256i v_permute_6 = _mm256_set1_epi32(6); + const __m256i v_permute_7 = _mm256_set1_epi32(7); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 1 line at a time, 16 samples per line + for (int j = 0; j < reduced_line; ++j) { + // line 1 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + // Arrange data so calculations can be done column-wise (to avoid using hadds). + // Need 8 source vectors. First will be filled with s00 and s01 pairs. 
Second with s02 and s03 pairs and so on + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + __m256i v_src_2 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_2); + __m256i v_src_3 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_3); + __m256i v_src_4 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_4); + __m256i v_src_5 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_5); + __m256i v_src_6 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_6); + __m256i v_src_7 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_7); + + __m256i v_madd_0_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_0_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_0_02 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_0_03 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + __m256i v_madd_0_04 = _mm256_madd_epi16(v_src_4, v_coeff[4]); + __m256i v_madd_0_05 = _mm256_madd_epi16(v_src_5, v_coeff[5]); + __m256i v_madd_0_06 = _mm256_madd_epi16(v_src_6, v_coeff[6]); + __m256i v_madd_0_07 = _mm256_madd_epi16(v_src_7, v_coeff[7]); + + __m256i v_madd_0_08 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_0_09 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + __m256i v_madd_0_10 = _mm256_madd_epi16(v_src_2, v_coeff[10]); + __m256i v_madd_0_11 = _mm256_madd_epi16(v_src_3, v_coeff[11]); + __m256i v_madd_0_12 = _mm256_madd_epi16(v_src_4, v_coeff[12]); + __m256i v_madd_0_13 = _mm256_madd_epi16(v_src_5, v_coeff[13]); + __m256i v_madd_0_14 = _mm256_madd_epi16(v_src_6, v_coeff[14]); + __m256i v_madd_0_15 = _mm256_madd_epi16(v_src_7, v_coeff[15]); + + __m256i v_madd_1_0 = _mm256_add_epi32(v_madd_0_00, v_madd_0_01); + __m256i v_madd_1_1 = _mm256_add_epi32(v_madd_0_02, v_madd_0_03); + __m256i v_madd_1_2 = _mm256_add_epi32(v_madd_0_04, v_madd_0_05); + __m256i v_madd_1_3 = _mm256_add_epi32(v_madd_0_06, v_madd_0_07); + __m256i v_madd_1_4 = _mm256_add_epi32(v_madd_0_08, v_madd_0_09); + 
__m256i v_madd_1_5 = _mm256_add_epi32(v_madd_0_10, v_madd_0_11); + __m256i v_madd_1_6 = _mm256_add_epi32(v_madd_0_12, v_madd_0_13); + __m256i v_madd_1_7 = _mm256_add_epi32(v_madd_0_14, v_madd_0_15); + + __m256i v_madd_2_0 = _mm256_add_epi32(v_madd_1_0, v_madd_1_1); + __m256i v_madd_2_1 = _mm256_add_epi32(v_madd_1_2, v_madd_1_3); + __m256i v_madd_2_2 = _mm256_add_epi32(v_madd_1_4, v_madd_1_5); + __m256i v_madd_2_3 = _mm256_add_epi32(v_madd_1_6, v_madd_1_7); + + __m256i v_madd_3_0 = _mm256_add_epi32(v_madd_2_0, v_madd_2_1); + __m256i v_madd_3_1 = _mm256_add_epi32(v_madd_2_2, v_madd_2_3); + + __m256i v_trunc_0 = truncate_avx2(v_madd_3_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_3_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + dst[0] = v_result; + + src += 16; + dst++; + } +} + +void fast_forward_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x2_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } + + __m256i v_hor_pass_out[2]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 2 source vectors + // Unpack -> samples to be added are adjacent + __m256i v_src_hi = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_trunc_hi_0 = truncate_avx2(v_madd_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_madd_hi_1, debias, shift_2nd); + __m256i v_trunc_lo_0 = truncate_avx2(v_madd_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_madd_lo_1, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_16x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i v_src_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_1 = _mm256_load_si256((const __m256i*) & src[16]); + + const __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_0, v_src_1); + const __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + __m256i v_trunc_0 = 
truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_0), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_1), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_0), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_1), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_16x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_madd_e[16]; + __m256i v_madd_o[16]; + for (int i = 0, c = 0; i < 16; ++i, c += 2) { + v_madd_e[i] = _mm256_madd_epi16(src[0], v_coeff[c + 0]); + v_madd_o[i] = _mm256_madd_epi16(src[1], v_coeff[c + 1]); + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_e[i], v_madd_o[i]); + } + + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_permute4x64_epi64(v_add[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_hadd_0[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int src = 0, dst = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]), debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void 
fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + // DST7 and DCT8 are not defined for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_16x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x4_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 vectors + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[2]); + __m256i v_madd_hi_02 = _mm256_madd_epi16(v_src_hi_0, v_coeff[4]); + __m256i v_madd_hi_03 = _mm256_madd_epi16(v_src_hi_0, v_coeff[6]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[3]); + __m256i v_madd_hi_12 = _mm256_madd_epi16(v_src_hi_1, v_coeff[5]); + __m256i v_madd_hi_13 = _mm256_madd_epi16(v_src_hi_1, v_coeff[7]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[2]); + __m256i v_madd_lo_02 = _mm256_madd_epi16(v_src_lo_0, v_coeff[4]); + __m256i v_madd_lo_03 = _mm256_madd_epi16(v_src_lo_0, v_coeff[6]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[3]); + __m256i v_madd_lo_12 = _mm256_madd_epi16(v_src_lo_1, v_coeff[5]); + __m256i v_madd_lo_13 = _mm256_madd_epi16(v_src_lo_1, v_coeff[7]); + + __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi_00, v_madd_hi_10); + __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi_01, v_madd_hi_11); + __m256i v_add_hi_2 = _mm256_add_epi32(v_madd_hi_02, v_madd_hi_12); + __m256i v_add_hi_3 = _mm256_add_epi32(v_madd_hi_03, v_madd_hi_13); + + __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo_00, 
v_madd_lo_10); + __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo_01, v_madd_lo_11); + __m256i v_add_lo_2 = _mm256_add_epi32(v_madd_lo_02, v_madd_lo_12); + __m256i v_add_lo_3 = _mm256_add_epi32(v_madd_lo_03, v_madd_lo_13); + + __m256i v_trunc_hi_0 = truncate_avx2(v_add_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_add_hi_1, debias, shift_2nd); + __m256i v_trunc_hi_2 = truncate_avx2(v_add_hi_2, debias, shift_2nd); + __m256i v_trunc_hi_3 = truncate_avx2(v_add_hi_3, debias, shift_2nd); + + __m256i v_trunc_lo_0 = truncate_avx2(v_add_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_add_lo_1, debias, shift_2nd); + __m256i v_trunc_lo_2 = truncate_avx2(v_add_lo_2, debias, shift_2nd); + __m256i v_trunc_lo_3 = truncate_avx2(v_add_lo_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_2, v_trunc_hi_2); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_3, v_trunc_hi_3); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_16x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[2], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[2], v_src_raw[3]); + + __m256i v_madd_lo_0[4]; + __m256i 
v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int i = 0; i < 4; i++) { + v_madd_lo_0[i] = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + v_madd_lo_1[i] = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + v_madd_hi_0[i] = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + v_madd_hi_1[i] = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[i], v_madd_lo_1[i]), debias, shift); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[i], v_madd_hi_1[i]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_hi[0]); + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_hi[1]); + dst[2] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_hi[2]); + dst[3] = _mm256_packs_epi32(v_trunc_lo[3], v_trunc_hi[3]); +} + +static void fast_inverse_tr_16x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[2], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[2], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[1], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[1], src[3], 0x31); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int i = 0; i < 16; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[i] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + + v_coeff += 2; + } + + 
__m256i v_add_0[16]; + __m256i v_add_1[16]; + for (int i = 0; i < 16; ++i) { + v_add_0[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + v_add_1[i] = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + } + + __m256i v_hadd_0[16]; + for (int i = 0; i < 16; ++i) { + v_hadd_0[i] = _mm256_hadd_epi32(v_add_0[i], v_add_1[i]); + } + + __m256i v_hadd_1[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + 
const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x16_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x16_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_16x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x8_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* line_coeff = (const int32_t*)ver_coeff; + + // Got 8 lines of samples. 
Handle two lines at a time (beacuse of unpack) + __m256i v_madd_hi[4][8]; + __m256i v_madd_lo[4][8]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 4; ++i) { + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + + // Apply coefficients + for (int ii = 0; ii < 8; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff); + v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff); + } + + line_coeff += 8; + v_src_ptr += 2; + } + + // First round of additions + __m256i v_add_hi[2][8]; + __m256i v_add_lo[2][8]; + for (int i = 0; i < 2; ++i) { + for (int ii = 0; ii < 8; ++ii) { + const int offset = i * 2; + v_add_hi[i][ii] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]); + v_add_lo[i][ii] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]); + } + } + + // Final round of additions, truncation and store + for (int ii = 0; ii < 8; ++ii) { + __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi[0][ii], v_add_hi[1][ii]), debias, shift_2nd); + __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo[0][ii], v_add_lo[1][ii]), debias, shift_2nd); + __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + + _mm256_store_si256((__m256i*)dst, v_result); + dst += 16; + } +} + + +static void fast_inverse_tr_16x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[4]; + __m256i v_src_hi[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_src_lo[dst] = _mm256_unpacklo_epi16(v_src_raw[src + 0], v_src_raw[src + 1]); + v_src_hi[dst] = 
_mm256_unpackhi_epi16(v_src_raw[src + 0], v_src_raw[src + 1]); + } + + __m256i v_trunc_lo[8]; + __m256i v_trunc_hi[8]; + + for (int c = 0; c < 8; c++) { + __m256i v_madd_lo[4]; + __m256i v_madd_hi[4]; + for (int i = 0; i < 4; ++i) { + v_madd_lo[i] = _mm256_madd_epi16(v_src_lo[i], v_coeff[i]); + v_madd_hi[i] = _mm256_madd_epi16(v_src_hi[i], v_coeff[i]); + } + v_coeff += 4; + + __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo[0], v_madd_lo[1]); + __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo[2], v_madd_lo[3]); + + __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi[0], v_madd_hi[1]); + __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi[2], v_madd_hi[3]); + + v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_0, v_add_lo_1), debias, shift); + v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_0, v_add_hi_1), debias, shift); + } + + for (int i = 0; i < 8; ++i) { + dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + } +} + +static void fast_inverse_tr_16x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_tmp32_lo_0 = _mm256_unpacklo_epi32(src[0], src[1]); + __m256i v_tmp32_lo_1 = _mm256_unpacklo_epi32(src[2], src[3]); + __m256i v_tmp32_lo_2 = _mm256_unpacklo_epi32(src[4], src[5]); + __m256i v_tmp32_lo_3 = _mm256_unpacklo_epi32(src[6], src[7]); + + __m256i v_tmp32_hi_0 = _mm256_unpackhi_epi32(src[0], src[1]); + __m256i v_tmp32_hi_1 = _mm256_unpackhi_epi32(src[2], src[3]); + __m256i v_tmp32_hi_2 = _mm256_unpackhi_epi32(src[4], src[5]); + __m256i v_tmp32_hi_3 = _mm256_unpackhi_epi32(src[6], src[7]); + + __m256i v_tmp64_lo_0 = _mm256_unpacklo_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_lo_1 = _mm256_unpacklo_epi64(v_tmp32_lo_2, v_tmp32_lo_3); 
+ __m256i v_tmp64_lo_2 = _mm256_unpacklo_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_lo_3 = _mm256_unpacklo_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_tmp64_hi_0 = _mm256_unpackhi_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_hi_1 = _mm256_unpackhi_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_hi_2 = _mm256_unpackhi_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_hi_3 = _mm256_unpackhi_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x20); + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x31); + + + __m256i v_trunc[16]; + for (int c = 0; c < 16; ++c) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff[i]); + } + v_coeff += 8; + + __m256i v_add_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_0[dst] = _mm256_add_epi32(v_madd[src + 0], v_madd[src + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + __m256i v_result[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_result[dst] = _mm256_packs_epi32(v_trunc[src + 0], v_trunc[src + 1]); + } + + for (int i = 0; i < 8; ++i) { + v_result[i] = _mm256_shuffle_epi8(v_result[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo_0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_lo_1 = 
_mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_lo_2 = _mm256_unpacklo_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_lo_3 = _mm256_unpacklo_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp32_hi_0 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_hi_1 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_hi_2 = _mm256_unpackhi_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_hi_3 = _mm256_unpackhi_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp64_lo_0 = _mm256_unpacklo_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_lo_1 = _mm256_unpacklo_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_lo_2 = _mm256_unpacklo_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_lo_3 = _mm256_unpacklo_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + __m256i v_rtmp64_hi_0 = _mm256_unpackhi_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_hi_1 = _mm256_unpackhi_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_hi_2 = _mm256_unpackhi_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_hi_3 = _mm256_unpackhi_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 
16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x16_coeff_hor; + const int16_t* hor_coeff = fi_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x16_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_16x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast into 32 bit integer to read two coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo[8][PART_DIMENSION]; + __m256i v_madd_hi[8][PART_DIMENSION]; + for (int i = 0; i < 8; ++i) { + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo[i][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + v_madd_hi[i][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + } + v_src_ptr += 2; + coeff_ptr += 16; + } + + __m256i v_trunc_lo[PART_DIMENSION]; + __m256i v_trunc_hi[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_add_lo_0[4]; + __m256i v_add_hi_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_0[dst] = _mm256_add_epi32(v_madd_lo[src + 0][i], v_madd_lo[src + 1][i]); + v_add_hi_0[dst] = _mm256_add_epi32(v_madd_hi[src + 0][i], v_madd_hi[src + 1][i]); + } + + __m256i v_add_lo_1[2]; + __m256i v_add_hi_1[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_1[dst] = _mm256_add_epi32(v_add_lo_0[src + 0], v_add_lo_0[src + 1]); + v_add_hi_1[dst] = _mm256_add_epi32(v_add_hi_0[src + 0], v_add_hi_0[src + 1]); + } + + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0], v_add_lo_1[1]), debias, shift_2nd); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0], v_add_hi_1[1]), debias, shift_2nd); + } + __m256i v_result[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + v_result[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + } + 
+ for (int i = 0; i < PART_DIMENSION; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + } + +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = (const __m256i*)src; + + //__m256i v_madd_lo[8][16]; + //__m256i v_madd_hi[8][16]; + //for (int s = 0; s < 8; ++s) { + // __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + // __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + // v_src_raw += 2; + + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_lo[s][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + // v_madd_hi[s][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_lo_0[4][16]; + //__m256i v_add_hi_0[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_lo_0[d][c] = _mm256_add_epi32(v_madd_lo[s + 0][c], v_madd_lo[s + 1][c]); + // v_add_hi_0[d][c] = _mm256_add_epi32(v_madd_hi[s + 0][c], v_madd_hi[s + 1][c]); + // } + //} + + //__m256i v_add_lo_1[2][16]; + //__m256i v_add_hi_1[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_lo_1[d][c] = _mm256_add_epi32(v_add_lo_0[s + 0][c], v_add_lo_0[s + 1][c]); + // v_add_hi_1[d][c] = _mm256_add_epi32(v_add_hi_0[s + 0][c], v_add_hi_0[s + 1][c]); + // } + //} + + //__m256i v_trunc_lo[16]; + //__m256i v_trunc_hi[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0][c], v_add_lo_1[1][c]), debias, shift); + // v_trunc_hi[c] = 
truncate_avx2(_mm256_add_epi32(v_add_hi_1[0][c], v_add_hi_1[1][c]), debias, shift); + //} + + //for (int i = 0; i < 16; ++i) { + // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + //} + + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 32]; + source[1] = src[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + dst[j] = packed; + } +} + +static void fast_inverse_tr_16x16_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i v_result[16]; + int16_t *src_p = (int16_t*)src; + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src_p[j + i * 32]; + source[1] = src_p[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i coeff_0 = 
_mm256_load_si256(coeff_start); + coeff_start++; + __m256i coeff_1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, coeff_0); + __m256i madd1 = _mm256_madd_epi16(v_src, coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i *)dst, packed); + dst += 16; + } + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = src; + + //// Do a 32-bit transpose to arrange result from previous pass + //__m256i v_tmp32_lo[8]; + //__m256i v_tmp32_hi[8]; + //for (int d = 0, s = 0; d < 8; ++d, s += 2) { + // v_tmp32_lo[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + // v_tmp32_hi[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + //} + + //__m256i v_tmp64_lo[8]; + //__m256i v_tmp64_hi[8]; + //for (int d = 0, s = 0; d < 4; ++d, s += 2) { + // v_tmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_lo[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + + // v_tmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_hi[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + //} + // + //__m256i v_src[16]; + //v_src[ 0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + //v_src[ 1] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + //v_src[ 2] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x20); + //v_src[ 3] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x20); + //v_src[ 4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 
0x31); + //v_src[ 5] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + //v_src[ 6] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x31); + //v_src[ 7] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x31); + + //v_src[ 8] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + //v_src[ 9] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + //v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x20); + //v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x20); + //v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + //v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); + //v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x31); + //v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x31); + + //__m256i v_madd_0[8][16]; + //__m256i v_madd_1[8][16]; + //for (int s = 0; s < 8; ++s) { + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_0[s][c] = _mm256_madd_epi16(v_src[0 + s], v_coeff); + // v_madd_1[s][c] = _mm256_madd_epi16(v_src[8 + s], v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_00[4][16]; + //__m256i v_add_01[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_00[d][c] = _mm256_add_epi32(v_madd_0[s + 0][c], v_madd_0[s + 1][c]); + // v_add_01[d][c] = _mm256_add_epi32(v_madd_1[s + 0][c], v_madd_1[s + 1][c]); + // } + //} + + //__m256i v_add_10[2][16]; + //__m256i v_add_11[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_10[d][c] = _mm256_add_epi32(v_add_00[s + 0][c], v_add_00[s + 1][c]); + // v_add_11[d][c] = _mm256_add_epi32(v_add_01[s + 0][c], v_add_01[s + 1][c]); + // } + //} + + //__m256i v_trunc_0[16]; + //__m256i v_trunc_1[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_0[c] = 
truncate_avx2(_mm256_add_epi32(v_add_10[0][c], v_add_10[1][c]), debias, shift); + // v_trunc_1[c] = truncate_avx2(_mm256_add_epi32(v_add_11[0][c], v_add_11[1][c]), debias, shift); + //} + + //__m256i v_result[16]; + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_packs_epi32(v_trunc_0[d], v_trunc_1[d]); + //} + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_permute4x64_epi64(v_result[d], _MM_SHUFFLE(3, 1, 2, 0)); + //} + + //transpose_avx2(v_result, (__m256i*)dst, 16, 16); +} + +void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + const int16_t* ver_coeff = fi_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x16_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x16_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x32_coeff_ver; + } + + int16_t v_hor_pass_out[32*16]; + fast_forward_DCT2_B16_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + + __m256i temp_out[32]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + if(ver == DCT2) { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + 
coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } + else { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i 
v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } +#if 0 + // To how many parts the vertical pass should be split. + // At least on my testing it seems that there is no further gain by splitting to more than 4 parts. +#define NUM_PARTS 4 +#define PART_DIMENSION (32/NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got 32 / NUM_PARTS lines of samples. Handle two lines at a time (beacuse of unpack) + __m256i v_madd_hi[16][PART_DIMENSION]; + __m256i v_madd_lo[16][PART_DIMENSION]; + // Samples are the same between the parts + __m256i* v_src_ptr = v_hor_pass_out; + // However for coefficients, the starting point needs to be adjusted + const int32_t* line_coeff = (const int32_t*)ver_coeff + PART_DIMENSION * part; + for (int i = 0; i < 16; ++i) { + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + + // Apply coefficients + // TODO: Here try loading the coefficient directly instead of set1 + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff); + v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff); + } + + line_coeff += 32; + v_src_ptr += 2; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_hi_0[8]; + __m256i v_add_lo_0[8]; + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_add_hi_0[i] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]); + v_add_lo_0[i] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_hi_1[4]; + __m256i 
v_add_lo_1[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_hi_1[i] = _mm256_add_epi32(v_add_hi_0[offset], v_add_hi_0[offset + 1]); + v_add_lo_1[i] = _mm256_add_epi32(v_add_lo_0[offset], v_add_lo_0[offset + 1]); + } + + // Third round of addtions + __m256i v_add_hi_2[2]; + __m256i v_add_lo_2[2]; + for (int i = 0; i < 2; ++i) { + const int offset = i * 2; + v_add_hi_2[i] = _mm256_add_epi32(v_add_hi_1[offset], v_add_hi_1[offset + 1]); + v_add_lo_2[i] = _mm256_add_epi32(v_add_lo_1[offset], v_add_lo_1[offset + 1]); + } + + // Final round of additions, truncate and store + __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi_2[0], v_add_hi_2[1]), debias, shift_2nd); + __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo_2[0], v_add_lo_2[1]), debias, shift_2nd); + __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + _mm256_store_si256((__m256i*)dst, v_result); + + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_16x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vectors at a time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_tmp16_lo[16]; + __m256i v_tmp16_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 1]); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 1]); + } + int row = 0; + for (; row < 32 - skip_line2; 
++row) { + __m256i v_res_lo = _mm256_setzero_si256(); + __m256i v_res_hi = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_lo = _mm256_madd_epi16(v_tmp16_lo[i], v_coeff); + __m256i v_madd_hi = _mm256_madd_epi16(v_tmp16_hi[i], v_coeff); + c_ptr++; + + v_res_lo = _mm256_add_epi32(v_res_lo, v_madd_lo); + v_res_hi = _mm256_add_epi32(v_res_hi, v_madd_hi); + } + + __m256i v_trunc_lo = truncate_avx2(v_res_lo, debias, shift); + __m256i v_trunc_hi = truncate_avx2(v_res_hi, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + dst[row] = packed; + } + + for (; row < 32; ++row) { + dst[row] = _mm256_setzero_si256(); + } +} + +static void fast_inverse_tr_16x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + int32_t * src_32 = (int32_t *)src; + for (int row = 0, d = 0; row < 32; ++row) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i *coeff_start = (__m256i*) coeff; + for (int i = 0; i < 8; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + __m256i v_madd_1 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*) dst + row, packed); + } +} + +void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ 
+ const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x32_coeff_hor; + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + int16_t* ver_pass_out = (int16_t*)v_ver_pass_out; + fast_inverse_tr_16x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int reduced_line = line - skip_line; + + for(int j = 0; j < reduced_line; ++j) { + int32_t source[16]; + memcpy(source, src, sizeof(int16_t) * 32); + src += 32; + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t *coeff_start = coeff; + for(int i = 0; i < 16; i++) { + __m256i v_src = _mm256_set1_epi32(source[i]); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if(line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm256_store_si256(dst, v_trunc_0); + dst++; + _mm256_store_si256(dst, v_trunc_1); + dst++; + } +} + +static void 
fast_forward_DCT8_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int cutoff = 32 - skip_line2; + const int reduced_line = line - skip_line; + + ALIGNED(32) int16_t temp_source[32 * 32]; + __m256i* v_src_p = (__m256i*) src; + for (int i = 0; i < reduced_line / 2; ++i) { + __m256i first_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i first_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i second_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + __m256i second_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + + _mm256_store_si256((__m256i*)temp_source + i * 4, first_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 1, first_half_hi); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 2, second_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 3, second_half_hi); + } + + for (int j = 0; j < reduced_line / 2; ++j) { + + int32_t source[32]; + memcpy(source, temp_source + 64 * j, sizeof(int16_t) * 64); + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = coeff; + + for (int i = 0; i < 32; i += 2) { + __m256i v_src0 = _mm256_set1_epi32(source[i]); + __m256i v_src1 = _mm256_set1_epi32(source[i + 1]); + + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src0, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src0, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src1, 
v_coeff_0); + __m256i madd_3 = _mm256_madd_epi16(v_src1, v_coeff_1); + + res_0 = _mm256_add_epi32(madd_0, res_0); + res_1 = _mm256_add_epi32(madd_1, res_1); + res_2 = _mm256_add_epi32(madd_2, res_2); + res_3 = _mm256_add_epi32(madd_3, res_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if (line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + } + _mm256_store_si256(dst, v_trunc_0); + dst+=2; + _mm256_store_si256(dst, v_trunc_2); + dst+=2; + } +} + + +static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_ptr = src; + + // Prepare coeffs + // TODO: either rename these old coeff tables to be consistent with other new avx2 functions + // or construct them here in place. 
Should be easy to accomplish with set1_epi32, just use an int32_t combined from two int16_t
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Got data for 8 vectors, 32 lines with 4 samples each + + // Prepare coeffs + const int16_t* coeff = uvg_g_dct_4; + const int a = coeff[0]; + const int b = coeff[1 * 4 + 0]; + const int c = coeff[1 * 4 + 1]; + + __m256i v_coeff_0 = _mm256_set1_epi16(a); + __m256i v_coeff_1 = _mm256_setr_epi16(b, c, -c, -b, b, c, -c, -b, b, c, -c, -b, b, c, -c, -b); + __m256i v_coeff_2 = _mm256_setr_epi16(a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a); + __m256i v_coeff_3 = _mm256_setr_epi16(c, -b, b, -c, c, -b, b, -c, c, -b, b, -c, c, -b, b, -c); + + const __m256i* v_src_ptr = src; + __m256i v_trunc_0[8]; + __m256i v_trunc_1[8]; + for (int j = 0; j < 8; ++j) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_3); + + v_trunc_0[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + v_trunc_1[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + v_src_ptr++; + } + + __m256i v_result[8]; + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_trunc_0[i] = _mm256_permute4x64_epi64(v_trunc_0[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1[i] = _mm256_permute4x64_epi64(v_trunc_1[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + v_tmp[0] = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + v_tmp[1] = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + v_tmp[2] = _mm256_packs_epi32(v_trunc_0[4], v_trunc_0[5]); + v_tmp[3] = _mm256_packs_epi32(v_trunc_0[6], v_trunc_0[7]); + v_tmp[4] = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + v_tmp[5] = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + v_tmp[6] = _mm256_packs_epi32(v_trunc_1[4], v_trunc_1[5]); + v_tmp[7] = _mm256_packs_epi32(v_trunc_1[6], v_trunc_1[7]); + + 
v_result[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); + + v_result[4] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_result[i]); + } +} + + +static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int16_t* const p_dst = dst; + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Re-use coeff table + const __m256i* v_coeff = (const __m256i*)ff_dct2_16x8_coeff_ver; + + const int reduced_line = line - skip_line; + const __m256i* v_src_ptr = src; + __m256i v_tmp_result[16]; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int i = 0; i < 16; ++i) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + // __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange source in a way samples can be added together column-wise using add + // after first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...] 
+ + __m256i v_src_0 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + v_tmp_result[i] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + v_src_ptr++; + } + + __m256i v_result[16]; + transpose_avx2(v_tmp_result, v_result, 8, 32); + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + if (skip_line) + { + dst = p_dst + reduced_line; + for (int j = 0; j < 8; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_line); + dst += line; + } + } +} + + +void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = (hor != DCT2 && width == 32) ? 
16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x2_coeff_ver; + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 source vectors, 2 lines 32 samples each + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[1]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[0]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[1]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[0]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + __m256i v_trunc_hi_00 = truncate_avx2(v_madd_hi_00, debias, shift_2nd); + __m256i v_trunc_hi_01 = truncate_avx2(v_madd_hi_01, debias, shift_2nd); + __m256i v_trunc_hi_10 = truncate_avx2(v_madd_hi_10, debias, shift_2nd); + __m256i 
v_trunc_hi_11 = truncate_avx2(v_madd_hi_11, debias, shift_2nd); + + __m256i v_trunc_lo_00 = truncate_avx2(v_madd_lo_00, debias, shift_2nd); + __m256i v_trunc_lo_01 = truncate_avx2(v_madd_lo_01, debias, shift_2nd); + __m256i v_trunc_lo_10 = truncate_avx2(v_madd_lo_10, debias, shift_2nd); + __m256i v_trunc_lo_11 = truncate_avx2(v_madd_lo_11, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_hi_00); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_hi_10); // Swap middle hi-lo lanes + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_01, v_trunc_hi_01); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_11, v_trunc_hi_11); + + // Swap middle 64-bit chunks + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_2 = _mm256_permute4x64_epi64(v_result_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_3 = _mm256_permute4x64_epi64(v_result_3, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_32x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i* v_src = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src[0], v_src[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src[1], v_src[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src[0], v_src[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src[1], v_src[3]); + + __m256i v_trunc_lo_00 = 
truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_0), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_1), debias, shift); + __m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_0), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_1), debias, shift); + + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_0), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_1), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_0), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_1), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_lo_11); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_32x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_permute4x64_epi64(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[i] = _mm256_add_epi32(v_add_00, v_add_01); + v_coeff += 4; + } + + __m256i v_hadd_0[16]; + for (int src = 0, dst = 0; dst < 16; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_hadd_1[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: cutoff for DCT8 and DST7 +} + +void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + 
const int height = 2; + + int skip_width = 0; // DST7 and DCT8 are not defined for this size. Therefore no skip width needed. + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x32_coeff_ver; // rename + // No DST7 and DCT8 tables needed. + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_32x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x4_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 8 vectors. 4 lines with 32 samples each. Need 2 vectors for each line + // Handle two lines at a time + __m256i v_madd_lo_even[2][4]; + __m256i v_madd_lo_odd[2][4]; + __m256i v_madd_hi_even[2][4]; + __m256i v_madd_hi_odd[2][4]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 2; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < 4; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 4; + v_src_ptr += 4; + } + + // Final add and truncate + __m256i v_trunc_lo_even[4]; + __m256i v_trunc_hi_even[4]; + __m256i v_trunc_lo_odd[4]; + __m256i v_trunc_hi_odd[4]; + for (int ii = 0; ii < 4; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_lo_even[0][ii], v_madd_lo_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_lo_odd[0][ii], v_madd_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_hi_even[0][ii], v_madd_hi_even[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_hi_odd[0][ii], v_madd_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < 4; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], 
v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_lo_1[2][4]; + __m256i v_madd_hi_0[2][4]; + __m256i v_madd_hi_1[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_lo_1[src][i] = _mm256_madd_epi16(v_src_lo_1, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + v_madd_hi_1[src][i] = _mm256_madd_epi16(v_src_hi_1, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[8]; + __m256i v_trunc_hi[8]; + for (int dst = 0, src = 0; src < 4; ++src, 
dst += 2) { + v_trunc_lo[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_lo[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_lo_1[0][src], v_madd_lo_1[1][src]), debias, shift); + v_trunc_hi[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + v_trunc_hi[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_hi_1[0][src], v_madd_hi_1[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[2]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[2]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[4], v_trunc_lo[6]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[4], v_trunc_hi[6]); + + if(skip_line == 0) { + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_lo[3]); + dst[3] = _mm256_packs_epi32(v_trunc_hi[1], v_trunc_hi[3]); + dst[5] = _mm256_packs_epi32(v_trunc_lo[5], v_trunc_lo[7]); + dst[7] = _mm256_packs_epi32(v_trunc_hi[5], v_trunc_hi[7]); + } + else { + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + } + + // TODO: mts cutoff +} +static void fast_inverse_tr_32x4_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_hi_0[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + } + v_c_ptr += 4; + 
v_src_raw += 4; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int src = 0; src < 4; ++src) { + v_trunc_lo[src] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_hi[src] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[1]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[1]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_lo[3]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[2], v_trunc_hi[3]); + + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + v_src[4] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_src[6] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = 
_mm256_set1_epi64x(c_ptr[3]); + __m256i v_coeff_4 = _mm256_set1_epi64x(c_ptr[4]); + __m256i v_coeff_5 = _mm256_set1_epi64x(c_ptr[5]); + __m256i v_coeff_6 = _mm256_set1_epi64x(c_ptr[6]); + __m256i v_coeff_7 = _mm256_set1_epi64x(c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff_4); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff_5); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff_6); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff_7); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_add[i] = _mm256_add_epi32(v_add_10, v_add_11); + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = 
_mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} +static void fast_inverse_tr_32x4_avx2_mts_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four 
coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + v_add[i] = v_add_10; + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = 
_mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} + +void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename + const int16_t* hor_coeff = uvg_g_dct_32_t; + if (hor == DST7) { + hor_coeff = uvg_g_dst7_32_t; + } else if (hor == DCT8) { + hor_coeff = uvg_g_dct8_32; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[8]; + if(ver == DCT2) { + fast_inverse_tr_32x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + else { + fast_inverse_tr_32x4_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + + if(hor == DCT2) { + fast_inverse_tr_32x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } + else { + fast_inverse_tr_32x4_avx2_mts_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } +} + + +void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x8_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same as for the other 32 and other dimension 8 or 16 + // However all 1,2,4 seem to be producing similar results as with increasing the value + // just shifts the pressure from one point to another +#define NUM_PARTS 4 +#define PART_DIMENSION (8 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got data for 16 vectors, 8 lines 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[4][PART_DIMENSION]; + __m256i v_madd_lo_odd[4][PART_DIMENSION]; + __m256i v_madd_hi_even[4][PART_DIMENSION]; + __m256i v_madd_hi_odd[4][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const __m256i* v_coeff = (const __m256i*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 4; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 8; + v_src_ptr += 4; + } + + // First round of additions + __m256i v_add_lo_even[2][PART_DIMENSION]; + __m256i v_add_hi_even[2][PART_DIMENSION]; + __m256i v_add_lo_odd[2][PART_DIMENSION]; + __m256i v_add_hi_odd[2][PART_DIMENSION]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_add_lo_even[i][ii] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even[i][ii] = 
_mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd[i][ii] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd[i][ii] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + } + + // Final add and truncate + __m256i v_trunc_lo_even[PART_DIMENSION]; + __m256i v_trunc_hi_even[PART_DIMENSION]; + __m256i v_trunc_lo_odd[PART_DIMENSION]; + __m256i v_trunc_hi_odd[PART_DIMENSION]; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_even[0][ii], v_add_lo_even[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_even[0][ii], v_add_hi_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_odd[0][ii], v_add_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_odd[0][ii], v_add_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* 
coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[8]; + __m256i v_src_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_lo[d + 1] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 1] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_lo_1[d] = _mm256_madd_epi16(v_src_lo[s + 1], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + v_madd_hi_1[d] = _mm256_madd_epi16(v_src_hi[s + 1], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + __m256i v_add_lo_10 = _mm256_add_epi32(v_madd_lo_1[0], v_madd_lo_1[1]); + __m256i v_add_lo_11 = _mm256_add_epi32(v_madd_lo_1[2], v_madd_lo_1[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + __m256i v_add_hi_10 = _mm256_add_epi32(v_madd_hi_1[0], v_madd_hi_1[1]); + __m256i v_add_hi_11 = _mm256_add_epi32(v_madd_hi_1[2], v_madd_hi_1[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + __m256i v_trunc_lo_1 = truncate_avx2(_mm256_add_epi32(v_add_lo_10, v_add_lo_11), debias, shift); + + __m256i v_trunc_hi_0 = 
truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + __m256i v_trunc_hi_1 = truncate_avx2(_mm256_add_epi32(v_add_hi_10, v_add_hi_11), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[4]; + __m256i v_src_hi[4]; + for (int d = 0, s = 0; d < 4; d += 1, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_hi_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 1) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_setzero_si256(); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int 
line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = skip_line2 == 16 ? 8 : 16; + + int32_t *src_32 = (int32_t*)src; + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < limit; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_coeff0 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff2 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff3 = _mm256_loadu_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i madd1 = _mm256_madd_epi16(v_src, v_coeff1); + __m256i madd2 = _mm256_madd_epi16(v_src, v_coeff2); + __m256i madd3 = _mm256_madd_epi16(v_src, v_coeff3); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + res_2 = _mm256_add_epi32(res_2, madd2); + res_3 = _mm256_add_epi32(res_3, madd3); + } + src_32 += limit == 8 ? 
8 : 0; + + __m256i v_trunk0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunk1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunk2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunk3 = truncate_avx2(res_3, debias, shift); + + __m256i packed0 = _mm256_packs_epi32(v_trunk0, v_trunk1); + __m256i packed1 = _mm256_packs_epi32(v_trunk2, v_trunk3); + + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, packed0); + _mm256_store_si256((__m256i*)dst + 1, packed1); + dst += 32; + } + + // TODO: cutoff for dct8 and dst7 +} + +void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x32_coeff_hor; // TODO: rename this table + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[16]; + if(ver == DCT2) { + fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_inverse_tr_32x8_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (ver 
!= DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x16_coeff_ver; + } + + __m256i v_hor_pass_out[32]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same as for 8x32 and 16x32, 4 parts is optimal +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got samples for 32 vectors, 16 lines with 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[8][PART_DIMENSION]; + __m256i v_madd_lo_odd[8][PART_DIMENSION]; + __m256i v_madd_hi_even[8][PART_DIMENSION]; + __m256i v_madd_hi_odd[8][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const int32_t* line_coeff = (const int32_t*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 8; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff); + } + + line_coeff += 16; + v_src_ptr += 4; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_lo_even_0[4]; + __m256i v_add_hi_even_0[4]; + __m256i v_add_lo_odd_0[4]; + __m256i v_add_hi_odd_0[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_lo_even_0[i] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even_0[i] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd_0[i] = 
_mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd_0[i] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_lo_even_1[2]; + __m256i v_add_hi_even_1[2]; + __m256i v_add_lo_odd_1[2]; + __m256i v_add_hi_odd_1[2]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + v_add_lo_even_1[i] = _mm256_add_epi32(v_add_lo_even_0[offset], v_add_lo_even_0[offset + 1]); + v_add_hi_even_1[i] = _mm256_add_epi32(v_add_hi_even_0[offset], v_add_hi_even_0[offset + 1]); + v_add_lo_odd_1[i] = _mm256_add_epi32(v_add_lo_odd_0[offset], v_add_lo_odd_0[offset + 1]); + v_add_hi_odd_1[i] = _mm256_add_epi32(v_add_hi_odd_0[offset], v_add_hi_odd_0[offset + 1]); + } + + // Final add and truncate + __m256i v_trunc_lo_even; + __m256i v_trunc_hi_even; + __m256i v_trunc_lo_odd; + __m256i v_trunc_hi_odd; + v_trunc_lo_even = truncate_avx2(_mm256_add_epi32(v_add_lo_even_1[0], v_add_lo_even_1[1]), debias, shift_2nd); + v_trunc_hi_even = truncate_avx2(_mm256_add_epi32(v_add_hi_even_1[0], v_add_hi_even_1[1]), debias, shift_2nd); + v_trunc_lo_odd = truncate_avx2(_mm256_add_epi32(v_add_lo_odd_1[0], v_add_lo_odd_1[1]), debias, shift_2nd); + v_trunc_hi_odd = truncate_avx2(_mm256_add_epi32(v_add_hi_odd_1[0], v_add_hi_odd_1[1]), debias, shift_2nd); + + + // Permute and store + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even, v_trunc_hi_even); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd, v_trunc_hi_odd); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + 
memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = 32 - skip_line; + __m256i temp[32]; + for (int j = 0; j < limit; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 64]; + source[1] = src[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + temp[j] = packed; + } + for (int j = limit; j < 32; ++j) { + temp[j] = _mm256_setzero_si256(); + } + transpose_avx2(temp, dst, 16, 32); +} + +static void fast_inverse_tr_32x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const 
__m256i*)shuffle_16b_0415); + + // Do a 32-bit transpose to arrange result from previous pass + __m256i v_tmp32_lo_e[8]; + __m256i v_tmp32_hi_e[8]; + __m256i v_tmp32_lo_o[8]; + __m256i v_tmp32_hi_o[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 4) { + v_tmp32_lo_e[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_hi_e[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_lo_o[d] = _mm256_unpacklo_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + v_tmp32_hi_o[d] = _mm256_unpackhi_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + __m256i v_tmp64_lo_e[8]; + __m256i v_tmp64_hi_e[8]; + __m256i v_tmp64_lo_o[8]; + __m256i v_tmp64_hi_o[8]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_tmp64_lo_e[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_lo_e[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_hi_e[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_hi_e[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_lo_o[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_lo_o[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + + v_tmp64_hi_o[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_hi_o[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + } + + __m256i v_src[32]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x20); + + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x31); + v_src[6] = 
_mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x31); + + v_src[8] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x20); + v_src[9] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x20); + v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x20); + v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x20); + + v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x31); + v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x31); + v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x31); + v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x31); + + v_src[16] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x20); + v_src[17] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x20); + v_src[18] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x20); + v_src[19] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x20); + + v_src[20] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x31); + v_src[21] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x31); + v_src[22] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x31); + v_src[23] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x31); + + v_src[24] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x20); + v_src[25] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x20); + v_src[26] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x20); + v_src[27] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x20); + + v_src[28] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x31); + v_src[29] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x31); + v_src[30] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], 
v_tmp64_lo_o[7], 0x31); + v_src[31] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x31); + + __m256i v_trunc[64]; + __m256i* v_src_ptr = v_src; + __m256i* v_tr_ptr = v_trunc; + + + for (int chunk = 0; chunk < 2; ++chunk) { + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + for (int c = 0; c < 32; ++c) { + __m256i v_madd[16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src_ptr[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_1[d] = _mm256_add_epi32(v_add_0[s + 0], v_add_0[s + 1]); + } + + __m256i v_add_2[2]; + for (int d = 0, s = 0; d < 2; ++d, s += 2) { + v_add_2[d] = _mm256_add_epi32(v_add_1[s + 0], v_add_1[s + 1]); + } + + v_tr_ptr[c] = truncate_avx2(_mm256_add_epi32(v_add_2[0], v_add_2[1]), debias, shift); + } + v_tr_ptr += 32; + v_src_ptr += 16; + } + + __m256i v_tmp[32]; + __m256i v_result[32]; + for (int i = 0, s = 0; i < 32; ++i, s += 2) { + v_tmp[i] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo[16]; + __m256i v_rtmp32_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_rtmp32_lo[d] = _mm256_unpacklo_epi32(v_tmp[s + 0], v_tmp[s + 1]); + v_rtmp32_hi[d] = _mm256_unpackhi_epi32(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_rtmp64_lo[16]; + __m256i v_rtmp64_hi[16]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_rtmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_lo[8 + d] = _mm256_unpacklo_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + + v_rtmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_hi[8 + d] = 
_mm256_unpackhi_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + } + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x20); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x20); + + v_result[8] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x31); + v_result[9] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x31); + v_result[10] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x31); + v_result[11] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x31); + + v_result[12] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x31); + v_result[13] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x31); + v_result[14] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x31); + v_result[15] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x31); + + v_result[16] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x20); + v_result[17] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x20); + v_result[18] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x20); + v_result[19] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x20); + + v_result[20] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x20); + v_result[21] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x20); + v_result[22] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x20); + v_result[23] = 
_mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x20); + + v_result[24] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x31); + v_result[25] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x31); + v_result[26] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x31); + v_result[27] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x31); + + v_result[28] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x31); + v_result[29] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x31); + v_result[30] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x31); + v_result[31] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x31); + + for (int i = 0; i < 32; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: MTS cutoff +} + +void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_32x16_coeff_ver; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_32x16_coeff_ver; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_32x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[32 * 32]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + __m256i temp_out[32 * 2]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + for (int j = 0; j < reduced_line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 64]; + source[1] = v_hor_pass_out[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + __m256i v_coeff_2; + __m256i v_coeff_3; + if(skip_height == 0) { + coeff_start += 16; + v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + } + else { + coeff_start += 48; + } + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2; + __m256i madd_3; + if(skip_height == 0) { + madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + } + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + if(skip_height == 0) { + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2; + __m256i v_trunc_3; + if(skip_height == 0) { + v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + } + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = 
_mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + if(skip_height == 0) { + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_2); + } + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 32); +#if 0 + // 8 is probably best, though difference to 16 is not that large +#define NUM_PARTS 8 +#define PART_DIMENSION (32 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast to 32 bit integer to read 2 coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo_e[16][PART_DIMENSION]; + __m256i v_madd_lo_o[16][PART_DIMENSION]; + __m256i v_madd_hi_e[16][PART_DIMENSION]; + __m256i v_madd_hi_o[16][PART_DIMENSION]; + for (int i = 0; i < 16; ++i) { + __m256i v_src_lo_e = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_o = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_hi_e = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_o = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo_e[i][c] = _mm256_madd_epi16(v_src_lo_e, v_coeff); + v_madd_lo_o[i][c] = _mm256_madd_epi16(v_src_lo_o, v_coeff); + v_madd_hi_e[i][c] = _mm256_madd_epi16(v_src_hi_e, v_coeff); + v_madd_hi_o[i][c] = _mm256_madd_epi16(v_src_hi_o, v_coeff); + } + coeff_ptr += 32; + v_src_ptr += 4; + } + + for (int c = 0; c < PART_DIMENSION; ++c) { + __m256i v_add_lo_e0[8]; + __m256i v_add_lo_o0[8]; + __m256i v_add_hi_e0[8]; + __m256i v_add_hi_o0[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_add_lo_e0[dst] = _mm256_add_epi32(v_madd_lo_e[src + 0][c], v_madd_lo_e[src + 1][c]); + v_add_lo_o0[dst] = 
_mm256_add_epi32(v_madd_lo_o[src + 0][c], v_madd_lo_o[src + 1][c]); + v_add_hi_e0[dst] = _mm256_add_epi32(v_madd_hi_e[src + 0][c], v_madd_hi_e[src + 1][c]); + v_add_hi_o0[dst] = _mm256_add_epi32(v_madd_hi_o[src + 0][c], v_madd_hi_o[src + 1][c]); + } + + __m256i v_add_lo_e1[4]; + __m256i v_add_lo_o1[4]; + __m256i v_add_hi_e1[4]; + __m256i v_add_hi_o1[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_e1[dst] = _mm256_add_epi32(v_add_lo_e0[src + 0], v_add_lo_e0[src + 1]); + v_add_lo_o1[dst] = _mm256_add_epi32(v_add_lo_o0[src + 0], v_add_lo_o0[src + 1]); + v_add_hi_e1[dst] = _mm256_add_epi32(v_add_hi_e0[src + 0], v_add_hi_e0[src + 1]); + v_add_hi_o1[dst] = _mm256_add_epi32(v_add_hi_o0[src + 0], v_add_hi_o0[src + 1]); + } + + __m256i v_add_lo_e2[2]; + __m256i v_add_lo_o2[2]; + __m256i v_add_hi_e2[2]; + __m256i v_add_hi_o2[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_e2[dst] = _mm256_add_epi32(v_add_lo_e1[src + 0], v_add_lo_e1[src + 1]); + v_add_lo_o2[dst] = _mm256_add_epi32(v_add_lo_o1[src + 0], v_add_lo_o1[src + 1]); + v_add_hi_e2[dst] = _mm256_add_epi32(v_add_hi_e1[src + 0], v_add_hi_e1[src + 1]); + v_add_hi_o2[dst] = _mm256_add_epi32(v_add_hi_o1[src + 0], v_add_hi_o1[src + 1]); + } + + __m256i v_trunc_lo_e = truncate_avx2(_mm256_add_epi32(v_add_lo_e2[0], v_add_lo_e2[1]), debias, shift_2nd); + __m256i v_trunc_lo_o = truncate_avx2(_mm256_add_epi32(v_add_lo_o2[0], v_add_lo_o2[1]), debias, shift_2nd); + __m256i v_trunc_hi_e = truncate_avx2(_mm256_add_epi32(v_add_hi_e2[0], v_add_hi_e2[1]), debias, shift_2nd); + __m256i v_trunc_hi_o = truncate_avx2(_mm256_add_epi32(v_add_hi_o2[0], v_add_hi_o2[1]), debias, shift_2nd); + + __m256i v_result_e = _mm256_packs_epi32(v_trunc_lo_e, v_trunc_hi_e); + __m256i v_result_o = _mm256_packs_epi32(v_trunc_lo_o, v_trunc_hi_o); + + v_result_e = _mm256_permute4x64_epi64(v_result_e, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_o = _mm256_permute4x64_epi64(v_result_o, _MM_SHUFFLE(3, 1, 2, 0)); + + 
_mm256_store_si256((__m256i*)dst, v_result_e); + dst += 16; + _mm256_store_si256((__m256i*)dst, v_result_o); + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[16][4]; + for (int d = 0, s = 0; d < 16; ++d, s += 4) { + v_src[d][0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][1] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][2] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + v_src[d][3] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + if(skip_line == 0) { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i][2], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i][3], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + v_res_2 = _mm256_add_epi32(v_res_2, 
v_madd_2); + v_res_3 = _mm256_add_epi32(v_res_3, v_madd_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + else { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_setzero_si256(); + } + } +} + +static void fast_inverse_tr_32x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + + // Do a 32 bit transpose on input + __m256i v_tmp32_lo[32]; + __m256i v_tmp32_hi[32]; + for (int d = 0, s = 0; d < 32; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(src[s + 0], src[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(src[s + 1], src[s + 3]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(src[s + 0], src[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(src[s + 1], src[s + 3]); + } + + __m256i v_tmp64_lo[32]; + __m256i v_tmp64_hi[32]; + for (int i = 0; i < 32; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_lo[i + 1] = 
_mm256_unpacklo_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + } + + __m256i v_src[64]; + for (int d = 0, s = 0; d < 64; d += 16, s += 8) { + v_src[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x20); + v_src[d + 1] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x20); + v_src[d + 2] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x20); + v_src[d + 3] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x20); + + v_src[d + 4] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x31); + v_src[d + 5] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x31); + v_src[d + 6] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x31); + v_src[d + 7] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x31); + + v_src[d + 8] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x20); + v_src[d + 9] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x20); + v_src[d + 10] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x20); + v_src[d + 11] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x20); + + v_src[d + 12] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x31); + v_src[d + 13] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x31); + v_src[d + 14] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x31); + v_src[d + 15] = 
_mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x31); + } + + __m256i v_tmp[64]; + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i + 0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i + 16], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i + 32], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i + 48], v_coeff); + + v_res_0 = _mm256_add_epi32(v_madd_0, v_res_0); + v_res_1 = _mm256_add_epi32(v_madd_1, v_res_1); + v_res_2 = _mm256_add_epi32(v_madd_2, v_res_2); + v_res_3 = _mm256_add_epi32(v_madd_3, v_res_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + + for (int i = 0; i < 64; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[64]; + transpose_avx2(v_tmp, v_result, 32, 32); + + for (int i = 0; i < 64; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[64]; + fast_inverse_tr_32x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static dct_full_pass* dct_function_table[6][6] = { + { NULL, NULL, fast_forward_tr_2x8_avx2, fast_forward_tr_2x16_avx2, fast_forward_tr_2x32_avx2, NULL }, + { NULL, fast_forward_tr_4x4_avx2, fast_forward_tr_4x8_avx2, fast_forward_tr_4x16_avx2, fast_forward_tr_4x32_avx2, NULL }, + { fast_forward_tr_8x2_avx2, fast_forward_tr_8x4_avx2, fast_forward_tr_8x8_avx2, fast_forward_tr_8x16_avx2, fast_forward_tr_8x32_avx2, NULL }, + { fast_forward_tr_16x2_avx2, fast_forward_tr_16x4_avx2, fast_forward_tr_16x8_avx2, fast_forward_tr_16x16_avx2, fast_forward_tr_16x32_avx2, NULL }, + { fast_forward_tr_32x2_avx2, fast_forward_tr_32x4_avx2, fast_forward_tr_32x8_avx2, fast_forward_tr_32x16_avx2, fast_forward_tr_32x32_avx2, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL } +}; + + +static dct_full_pass* idct_function_table[6][6] = { + { NULL, NULL, fast_inverse_tr_2x8_avx2, fast_inverse_tr_2x16_avx2, fast_inverse_tr_2x32_avx2, NULL }, + { NULL, fast_inverse_tr_4x4_avx2, fast_inverse_tr_4x8_avx2, fast_inverse_tr_4x16_avx2, fast_inverse_tr_4x32_avx2, NULL }, + { fast_inverse_tr_8x2_avx2, fast_inverse_tr_8x4_avx2, fast_inverse_tr_8x8_avx2, fast_inverse_tr_8x16_avx2, fast_inverse_tr_8x32_avx2, NULL }, + { fast_inverse_tr_16x2_avx2, fast_inverse_tr_16x4_avx2, fast_inverse_tr_16x8_avx2, 
fast_inverse_tr_16x16_avx2, fast_inverse_tr_16x32_avx2, NULL }, + { fast_inverse_tr_32x2_avx2, fast_inverse_tr_32x4_avx2, fast_inverse_tr_32x8_avx2, fast_inverse_tr_32x16_avx2, fast_inverse_tr_32x32_avx2, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL }, +}; + extern void uvg_get_tr_type( int8_t width, @@ -1606,13 +8074,27 @@ static void mts_dct_avx2( dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } - else - { - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - - tr_func* dct = dct_table[log2_width_minus2]; - - dct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx); + else{ + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + if (height == 1) { + if (width == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0); + } else if (width == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0); + } + } + else if (width == 1){ + if (height == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? 
ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0); + } + } + else { + dct_full_pass* dct_func = dct_function_table[log2_width_minus1][log2_height_minus1]; + dct_func(input, output, type_hor, type_ver); + } } } @@ -1637,13 +8119,12 @@ static void mts_idct_avx2( dct_func* idct_func = uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } - else - { - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + else { + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - tr_func* idct = idct_table[log2_width_minus2]; - - idct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx); + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); } } @@ -1658,19 +8139,19 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) if (bitdepth == 8){ //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); + success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); + success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); + success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); + success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, 
&matrix_dct_32x32_avx2); - //success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + // success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); + success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); + success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); + success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); + success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); - //success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); + success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); } diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h new file mode 100644 index 00000000..2233916b --- /dev/null +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -0,0 +1,4785 @@ +#ifndef DCT_AVX2_TABLES_H +#define DCT_AVX2_TABLES_H + +#include "global.h" + +// Shuffle tables for simple avx2 functions + +ALIGNED(32) const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; +ALIGNED(32) const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; + +ALIGNED(32) const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 0, 0, 2, 2, 2, 2 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 
6, 6, 6 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; + +ALIGNED(32) const int32_t ff_dct2_b32_permute[8][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {5, 5, 5, 5, 5, 5, 5, 5}, + {6, 6, 6, 6, 6, 6, 6, 6}, + {7, 7, 7, 7, 7, 7, 7, 7}, +}; + + +// Coeff tables for simple avx2 functions + +ALIGNED(32) const int16_t fast_forward_dct2_b2_coeff[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +}; + + const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward + +// Coeff arrays for B4 +ALIGNED(32) const int16_t fast_forward_dct2_b4_coeff[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83, +-36, -83, -36, -83, -36, -83, -36, -83, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b4_coeff[64] = { + 29, 55, 29, 55, 29, 55, 29, 55, 84, -29, 84, -29, 84, -29, 84, -29, + 74, 84, 74, 84, 74, 84, 74, 84, -74, 55, -74, 55, -74, 55, -74, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 55, -84, 55, -84, 55, -84, 55, -84, + 0, -74, 0, -74, 0, -74, 0, -74, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b4_coeff[64] = { + 84, 74, 84, 74, 84, 74, 84, 74, 55, -74, 55, -74, 55, -74, 55, -74, + 55, 29, 55, 29, 55, 29, 55, 29, -29, 84, -29, 84, -29, 84, -29, 84, + 74, 0, 74, 0, 74, 0, 74, 0, 29, -74, 29, -74, 29, -74, 29, -74, +-74, -74, -74, -74, -74, -74, -74, -74, 84, -55, 84, -55, 84, -55, 84, -55, +}; + +// Coeff arrays for inverse B4 +ALIGNED(32) const int16_t 
fast_inverse_dct2_b4_coeff[64] = { + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b4_coeff[64] = { + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, +}; + +ALIGNED(32) const int16_t fast_inverse_dct8_b4_coeff[64] = { + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, +}; + +// Coeff arrays for forward B8 +ALIGNED(32) const int16_t fast_forward_dct2_b8_coeff[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b8_coeff[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, 
-85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b8_coeff[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, + 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +// Coeff arrays for inverse B8 +ALIGNED(32) const int16_t fast_inverse_dct2_b8_coeff[128] = { + 64, 89, 64, 75, 64, 50, 64, 18, 64, 89, 64, 75, 64, 50, 64, 18, + 83, 75, 36, -18, -36, -89, -83, -50, 83, 75, 36, -18, -36, -89, -83, -50, + 64, 50, -64, -89, -64, 18, 64, 75, 64, 50, -64, -89, -64, 18, 64, 75, + 36, 18, -83, -50, 83, 75, -36, -89, 36, 18, -83, -50, 83, 75, -36, -89, + 64, -18, 64, -50, 64, -75, 64, -89, 64, -18, 64, -50, 64, -75, 64, -89, +-83, 50, -36, 89, 36, 18, 83, -75, -83, 50, -36, 89, 36, 18, 83, -75, + 64, -75, -64, -18, -64, 89, 64, -50, 64, -75, -64, -18, -64, 89, 64, -50, +-36, 89, 83, -75, -83, 50, 36, -18, -36, 89, 83, -75, -83, 50, 36, -18, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b8_coeff[128] = { + 17, 46, 32, 78, 46, 86, 60, 71, 17, 46, 32, 78, 46, 86, 60, 71, + 71, 85, 85, 46, 32, -60, -46, -78, 71, 85, 85, 46, 32, -60, -46, -78, + 86, 78, -17, -71, -85, -17, 32, 85, 86, 78, -17, -71, -85, -17, 32, 85, + 60, 32, -86, -60, 71, 78, 
-17, -86, 60, 32, -86, -60, 71, 78, -17, -86, + 71, 32, 78, -17, 85, -60, 86, -85, 71, 32, 78, -17, 85, -60, 86, -85, +-86, 17, -60, 86, 17, 32, 78, -71, -86, 17, -60, 86, 17, 32, 78, -71, + 78, -60, -46, -32, -71, 86, 60, -46, 78, -60, -46, -32, -71, 86, 60, -46, +-46, 85, 85, -71, -78, 46, 32, -17, -46, 85, 85, -71, -78, 46, 32, -17, +}; + + const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. + +// Coeff arrays for forward B16 +ALIGNED(32) const int16_t fast_forward_dct2_b16_coeff[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b16_coeff[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, 
-88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b16_coeff[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, 
-8, -68, -68, 88, -55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +// Coeff arrays for inverse B16 +ALIGNED(32) const int16_t fast_inverse_dct2_b16_coeff[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, -9, 64, -25, 64, -43, 64, -57, + 89, 87, 75, 57, 50, 9, 18, -43, -89, 25, -75, 70, -50, 90, -18, 80, + 83, 80, 36, 9, -36, -70, -83, -87, 83, -43, 36, -90, -36, -57, -83, 25, + 75, 70, -18, -43, -89, -87, -50, 9, -75, 57, 18, 80, 89, -25, 50, -90, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -70, -64, -43, -64, 87, 64, 9, + 50, 43, -89, -90, 18, 57, 75, 25, -50, 80, 89, -9, -18, -70, -75, 87, + 36, 25, -83, -70, 83, 90, -36, -80, 36, -87, -83, 57, 83, -9, -36, -43, + 18, 9, -50, -25, 75, 43, -89, -57, -18, 90, 50, -87, -75, 80, 89, -70, + 64, 57, 64, 43, 64, 25, 64, 9, 64, -70, 64, -80, 64, -87, 64, -90, + -18, -80, -50, -90, -75, -70, -89, -25, 18, 43, 50, -9, 75, -57, 89, -87, + -83, -25, -36, 57, 36, 90, 83, 43, -83, 87, -36, 70, 36, -9, 83, -80, + 50, 90, 89, 25, 18, -80, -75, -57, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -9, -64, -87, -64, 43, 64, 70, 64, -90, -64, 25, -64, 80, 64, -57, + -75, -87, -18, 70, 89, 9, -50, -80, 75, -25, 18, -57, -89, 90, 50, -43, + -36, 43, 83, 9, -83, -57, 36, 87, -36, 80, 83, -90, -83, 70, 36, -25, + 89, 70, -75, -80, 50, 87, -18, -90, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b16_coeff[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 68, 48, 73, 25, 77, 0, 81, -25, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, -81, -25, -88, 33, -77, 77, -48, 88, + 68, 77, 88, 77, 48, 0, -25, -77, 88, 0, 68, -77, 0, -77, -68, 0, + 85, 88, 55, 25, -48, -81, -87, -48, -88, 25, -17, 88, 77, 0, 68, -88, + 88, 87, -8, -40, -88, -68, 17, 73, 81, -48, -40, -62, -77, 77, 48, 25, + 81, 73, -68, -85, -25, 25, 88, 55, -68, 68, 81, 8, 0, -77, -81, 81, + 62, 48, -88, -81, 68, 88, -8, -68, 48, -81, 
-87, 48, 77, 0, -25, -48, + 33, 17, -62, -33, 81, 48, -88, -62, -25, 88, 55, -85, -77, 77, 88, -68, + 40, 88, 48, 88, 55, 81, 62, 68, 85, -48, 87, -68, 88, -81, 88, -88, // 8 + 62, -17, 25, -68, -17, -88, -55, -73, -8, 62, 33, 8, 68, -48, 87, -85, +-81, -77, -81, 0, -25, 77, 48, 77, -88, 77, -48, 77, 25, 0, 81, -77, + -8, 68, 81, 68, 62, -48, -40, -81, -33, -25, -88, 81, -25, 48, 73, -68, + 87, 33, -25, -88, -85, 8, 33, 85, 73, -88, -55, 17, -68, 81, 62, -55, +-48, -88, -48, 48, 88, 33, -25, -87, 68, -17, 25, -62, -88, 88, 48, -40, +-55, 25, 88, 25, -73, -68, 17, 88, -40, 81, 85, -88, -81, 68, 33, -25, + 85, 73, -68, -81, 40, 87, -8, -88, -87, 55, 73, -40, -48, 25, 17, -8, +}; + + const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; + +// Coeff arrays for forward B32 +ALIGNED(32) const int16_t fast_forward_dct2_b32_coeff[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, 
-73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, 
-70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b32_coeff[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, 
-86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 
// 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 
89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b32_coeff[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 
50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 
88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +// Coeff arrays for inverse B32 +ALIGNED(32) const int16_t fast_inverse_dct2_b32_coeff[1024] = { + 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, // 0 + 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, + 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, + -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, + 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, // 8 +-89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, +-89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, 
-43, -67, -80, -90, -90, -73, -70, -22, -25, 38, + 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, +-25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, // 16 + 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, + 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, +-43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, + 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, // 24 +-75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, +-75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, + 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, +-57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, // 32 + 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, + 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, +-70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, + 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, 
-25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, // 40 +-50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, +-50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, + 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, +-80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, // 48 + 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, + 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, +-87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, + 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, // 56 +-18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, +-18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, + 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, +-90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b32_coeff[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, // 0 + 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, + 66, 
56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, + 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, + 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, +-21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, // 8 +-63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, +-89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, +-53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, + 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, // 16 + 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, + 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, + 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, + 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, // 24 +-89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, +-84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, + 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 
74, -42, -38, -82, +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, +-42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, // 32 + 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, + 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, +-34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, + 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, // 40 +-77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, +-60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, + 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, +-74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, // 48 + 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, + 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, +-78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, + 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, // 56 
+-30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, +-21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, + 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, +-89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; + + +// Shuffle tables for advanced and optimized avx2 functions + +// Shuffle 16 bit samples inside lanes. Put each sample four spaces from each other adjacent to each other. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | XX +// Output [0 4 1 5 2 6 3 7 | XX +ALIGNED(32) const int8_t shuffle_16b_0415[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Shuffle 16 bit samples inside lanes. Put each even indexed sample next to each other, then each odd sample. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | +// Output [0 2 4 6 1 3 5 7 | +ALIGNED(32) const int8_t shuffle_16b_0246[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +// Permute 32 bit samples across lanes. Put each sample four spaces from each other adjacent to each other. 
+// _mm256_permutevar8x32_epi32 +// Input [0 1 2 3 | 4 5 6 7] +// Output [0 1 4 5 | 2 6 3 7] +ALIGNED(32) const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + + const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415; + +ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle1_ver[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t ff_dct2_2x8_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) const int8_t ff_dct2_2x8_result_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle2_ver[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +ALIGNED(32) const int8_t ff_dct2_2x16_ver_result_shuffle[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +ALIGNED(32) const int8_t fi_tr_4x4_shuffle_hor[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +ALIGNED(32) const int8_t fi_tr_4x4_result_shuffle_ver[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t fi_tr_4x8_result_shuffle_ver[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; + +ALIGNED(32) const int8_t fi_tr_8x2_shuffle_hor[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 
5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t fi_tr_8x2_shuffle_ver[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + + const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415; + +ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = { + 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15, + 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15, +}; + +// TODO: remove duplicate tables. Rename with a more descriptive name. +ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +// Arrange samples into butterfly formation +ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +// Swap two middle 16-bit values in each 64-bit chunk +ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) const int8_t ff_dct2_16x32_reverse_64b_order[32] = { + 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, + 22, 23, 20, 21, 18, 19, 16, 17, 30, 31, 28, 29, 26, 27, 24, 25, +}; + +ALIGNED(32) const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 
+}; + +ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_order[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_result[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +}; + + +// Coeff tables for advanced and optimized avx2 functions + +// 2xN +ALIGNED(32) const int16_t ff_dct2_2xN_coeff_hor[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 +}; + +ALIGNED(32) const int16_t ff_dct2_2x8_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, -18, -50, -75, -89, 89, 75, 50, 18, -18, -50, -75, -89, + 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, + 75, -18, -89, -50, 50, 89, 18, -75, 75, -18, -89, -50, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, -75, -18, 89, -50, 50, -89, 18, 75, -75, -18, 89, -50, + 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, + 18, -50, 75, -89, 89, -75, 50, -18, 18, -50, 75, -89, 89, -75, 50, -18 +}; + +ALIGNED(32) +const int16_t ff_dst7_2x8_coeff_ver[128] = { + 17, 32, 46, 60, 71, 78, 85, 86, 17, 32, 46, 60, 71, 78, 85, 86, + 46, 78, 86, 71, 32, -17, -60, -85, 46, 78, 86, 71, 32, -17, -60, -85, + 71, 85, 32, -46, -86, -60, 17, 78, 71, 85, 32, -46, -86, -60, 17, 78, + 85, 46, -60, -78, 17, 86, 32, -71, 85, 46, -60, -78, 17, 86, 32, -71, + 86, -17, -85, 32, 78, -46, -71, 60, 86, -17, -85, 32, 78, -46, -71, 60, + 78, -71, -17, 85, -60, -32, 86, -46, 78, -71, -17, 85, -60, -32, 86, -46, + 60, -86, 71, -17, -46, 85, -78, 32, 60, -86, 71, -17, -46, 85, -78, 32, + 32, -60, 78, -86, 85, -71, 46, -17, 32, -60, 78, -86, 85, -71, 46, -17, +}; + + +ALIGNED(32) const int16_t 
fi_dct2_2x8_coeff_ver[128] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18, + 64, 50, 36, 18, 64, 50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50, +-64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89, + 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75, +-64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, -64, -80, -89, -90, -64, -80, -89, -90, + 36, 9, -18, -43, 36, 9, -18, -43, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, -64, -25, 18, 57, -64, -25, 18, 57, +-36, -70, -89, -87, -36, -70, -89, -87, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 90, 75, 25, 64, 90, 75, 25, +-83, -87, -50, 9, -83, -87, -50, 9, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, -9, -75, -87, 64, -9, -75, -87, // 8 +-83, -25, 50, 90, -83, -25, 50, 90, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, -64, -87, -18, 70, -64, -87, -18, 70, +-36, 57, 89, 25, -36, 57, 89, 25, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, -64, 43, 89, 9, -64, 43, 89, 9, + 36, 90, 18, -80, 36, 90, 18, -80, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 70, -50, -80, 64, 70, -50, -80, + 83, 43, -75, -57, 83, 43, -75, -57, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -70, -50, 80, 64, -70, -50, 80, // 16 + 83, -43, -75, 57, 83, -43, -75, 57, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 
70, 64, -25, -75, 70, -64, -43, 89, -9, -64, -43, 89, -9, + 36, -90, 18, 80, 36, -90, 18, 80, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, -64, 87, -18, -70, -64, 87, -18, -70, +-36, -57, 89, -25, -36, -57, 89, -25, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, 9, -75, 87, 64, 9, -75, 87, +-83, 25, 50, -90, -83, 25, 50, -90, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -90, 75, -25, 64, -90, 75, -25, // 24 +-83, 87, -50, -9, -83, 87, -50, -9, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, -64, 25, 18, -57, -64, 25, 18, -57, +-36, 70, -89, 87, -36, 70, -89, 87, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, -64, 80, -89, 90, -64, 80, -89, 90, + 36, -9, -18, 43, 36, -9, -18, 43, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -57, 50, -43, 64, -57, 50, -43, + 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { + 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 + 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, + 64, 61, 57, 54, 50, 46, 43, 38, 64, 61, 57, 54, 50, 46, 43, 38, + 36, 31, 25, 22, 18, 13, 9, 4, 36, 31, 25, 22, 18, 13, 9, 4, + 64, 90, 87, 82, 75, 67, 57, 46, 64, 90, 87, 82, 75, 67, 57, 46, + 36, 22, 9, -4, -18, -31, -43, -54, 36, 22, 9, -4, -18, -31, -43, -54, +-64, -73, -80, -85, -89, -90, -90, -88, -64, -73, -80, -85, -89, -90, -90, -88, +-83, -78, -70, -61, -50, -38, -25, -13, -83, -78, -70, -61, -50, -38, -25, -13, + 64, 88, 80, 67, 50, 31, 9, -13, 64, 88, 80, 67, 50, 31, 9, -13, // 8 +-36, -54, -70, -82, -89, -90, -87, -78, -36, -54, -70, -82, -89, -90, -87, -78, +-64, -46, -25, -4, 18, 38, 57, 73, -64, -46, -25, -4, 18, 38, 57, 73, + 83, 90, 90, 85, 75, 61, 43, 22, 83, 90, 90, 85, 75, 61, 43, 22, + 64, 85, 70, 46, 18, -13, -43, -67, 64, 85, 70, 46, 18, -13, -43, 
-67, +-83, -90, -87, -73, -50, -22, 9, 38, -83, -90, -87, -73, -50, -22, 9, 38, + 64, 82, 90, 88, 75, 54, 25, -4, 64, 82, 90, 88, 75, 54, 25, -4, +-36, -61, -80, -90, -89, -78, -57, -31, -36, -61, -80, -90, -89, -78, -57, -31, + 64, 82, 57, 22, -18, -54, -80, -90, 64, 82, 57, 22, -18, -54, -80, -90, // 16 +-83, -61, -25, 13, 50, 78, 90, 85, -83, -61, -25, 13, 50, 78, 90, 85, + 64, 31, -9, -46, -75, -90, -87, -67, 64, 31, -9, -46, -75, -90, -87, -67, +-36, 4, 43, 73, 89, 88, 70, 38, -36, 4, 43, 73, 89, 88, 70, 38, + 64, 78, 43, -4, -50, -82, -90, -73, 64, 78, 43, -4, -50, -82, -90, -73, +-36, 13, 57, 85, 89, 67, 25, -22, -36, 13, 57, 85, 89, 67, 25, -22, +-64, -88, -87, -61, -18, 31, 70, 90, -64, -88, -87, -61, -18, 31, 70, 90, + 83, 54, 9, -38, -75, -90, -80, -46, 83, 54, 9, -38, -75, -90, -80, -46, + 64, 73, 25, -31, -75, -90, -70, -22, 64, 73, 25, -31, -75, -90, -70, -22, // 24 + 36, 78, 90, 67, 18, -38, -80, -90, 36, 78, 90, 67, 18, -38, -80, -90, +-64, -13, 43, 82, 89, 61, 9, -46, -64, -13, 43, 82, 89, 61, 9, -46, +-83, -88, -57, -4, 50, 85, 87, 54, -83, -88, -57, -4, 50, 85, 87, 54, + 64, 67, 9, -54, -89, -78, -25, 38, 64, 67, 9, -54, -89, -78, -25, 38, + 83, 85, 43, -22, -75, -90, -57, 4, 83, 85, 43, -22, -75, -90, -57, 4, + 64, 90, 70, 13, -50, -88, -80, -31, 64, 90, 70, 13, -50, -88, -80, -31, + 36, 82, 87, 46, -18, -73, -90, -61, 36, 82, 87, 46, -18, -73, -90, -61, + 64, 61, -9, -73, -89, -46, 25, 82, 64, 61, -9, -73, -89, -46, 25, 82, // 32 + 83, 31, -43, -88, -75, -13, 57, 90, 83, 31, -43, -88, -75, -13, 57, 90, + 64, -4, -70, -90, -50, 22, 80, 85, 64, -4, -70, -90, -50, 22, 80, 85, + 36, -38, -87, -78, -18, 54, 90, 67, 36, -38, -87, -78, -18, 54, 90, 67, + 64, 54, -25, -85, -75, -4, 70, 88, 64, 54, -25, -85, -75, -4, 70, 88, + 36, -46, -90, -61, 18, 82, 80, 13, 36, -46, -90, -61, 18, 82, 80, 13, +-64, -90, -43, 38, 89, 67, -9, -78, -64, -90, -43, 38, 89, 67, -9, -78, +-83, -22, 57, 90, 50, -31, -87, -73, -83, -22, 57, 90, 50, -31, -87, -73, + 64, 46, 
-43, -90, -50, 38, 90, 54, 64, 46, -43, -90, -50, 38, 90, 54, // 40 +-36, -90, -57, 31, 89, 61, -25, -88, -36, -90, -57, 31, 89, 61, -25, -88, +-64, 22, 87, 67, -18, -85, -70, 13, -64, 22, 87, 67, -18, -85, -70, 13, + 83, 73, -9, -82, -75, 4, 80, 78, 83, 73, -9, -82, -75, 4, 80, 78, + 64, 38, -57, -88, -18, 73, 80, -4, 64, 38, -57, -88, -18, 73, 80, -4, +-83, -67, 25, 90, 50, -46, -90, -31, -83, -67, 25, 90, 50, -46, -90, -31, + 64, 85, 9, -78, -75, 13, 87, 61, 64, 85, 9, -78, -75, 13, 87, 61, +-36, -90, -43, 54, 89, 22, -70, -82, -36, -90, -43, 54, 89, 22, -70, -82, + 64, 31, -70, -78, 18, 90, 43, -61, 64, 31, -70, -78, 18, 90, 43, -61, // 48 +-83, 4, 87, 54, -50, -88, -9, 82, -83, 4, 87, 54, -50, -88, -9, 82, + 64, -38, -90, -22, 75, 73, -25, -90, 64, -38, -90, -22, 75, 73, -25, -90, +-36, 67, 80, -13, -89, -46, 57, 85, -36, 67, 80, -13, -89, -46, 57, 85, + 64, 22, -80, -61, 50, 85, -9, -90, 64, 22, -80, -61, 50, 85, -9, -90, +-36, 73, 70, -38, -89, -4, 87, 46, -36, 73, 70, -38, -89, -4, 87, 46, +-64, -78, 25, 90, 18, -82, -57, 54, -64, -78, 25, 90, 18, -82, -57, 54, + 83, -13, -90, -31, 75, 67, -43, -88, 83, -13, -90, -31, 75, 67, -43, -88, + 64, 13, -87, -38, 75, 61, -57, -78, 64, 13, -87, -38, 75, 61, -57, -78, // 56 + 36, 88, -9, -90, -18, 85, 43, -73, 36, 88, -9, -90, -18, 85, 43, -73, +-64, 54, 80, -31, -89, 4, 90, 22, -64, 54, 80, -31, -89, 4, 90, 22, +-83, -46, 70, 67, -50, -82, 25, 90, -83, -46, 70, 67, -50, -82, 25, 90, + 64, 4, -90, -13, 89, 22, -87, -31, 64, 4, -90, -13, 89, 22, -87, -31, + 83, 38, -80, -46, 75, 54, -70, -61, 83, 38, -80, -46, 75, 54, -70, -61, + 64, 67, -57, -73, 50, 78, -43, -82, 64, 67, -57, -73, 50, 78, -43, -82, + 36, 85, -25, -88, 18, 90, -9, -90, 36, 85, -25, -88, 18, 90, -9, -90, + 64, -4, -90, 13, 89, -22, -87, 31, 64, -4, -90, 13, 89, -22, -87, 31, // 64 + 83, -38, -80, 46, 75, -54, -70, 61, 83, -38, -80, 46, 75, -54, -70, 61, + 64, -67, -57, 73, 50, -78, -43, 82, 64, -67, -57, 73, 50, -78, -43, 82, + 36, -85, -25, 88, 18, 
-90, -9, 90, 36, -85, -25, 88, 18, -90, -9, 90, + 64, -13, -87, 38, 75, -61, -57, 78, 64, -13, -87, 38, 75, -61, -57, 78, + 36, -88, -9, 90, -18, -85, 43, 73, 36, -88, -9, 90, -18, -85, 43, 73, +-64, -54, 80, 31, -89, -4, 90, -22, -64, -54, 80, 31, -89, -4, 90, -22, +-83, 46, 70, -67, -50, 82, 25, -90, -83, 46, 70, -67, -50, 82, 25, -90, + 64, -22, -80, 61, 50, -85, -9, 90, 64, -22, -80, 61, 50, -85, -9, 90, // 72 +-36, -73, 70, 38, -89, 4, 87, -46, -36, -73, 70, 38, -89, 4, 87, -46, +-64, 78, 25, -90, 18, 82, -57, -54, -64, 78, 25, -90, 18, 82, -57, -54, + 83, 13, -90, 31, 75, -67, -43, 88, 83, 13, -90, 31, 75, -67, -43, 88, + 64, -31, -70, 78, 18, -90, 43, 61, 64, -31, -70, 78, 18, -90, 43, 61, +-83, -4, 87, -54, -50, 88, -9, -82, -83, -4, 87, -54, -50, 88, -9, -82, + 64, 38, -90, 22, 75, -73, -25, 90, 64, 38, -90, 22, 75, -73, -25, 90, +-36, -67, 80, 13, -89, 46, 57, -85, -36, -67, 80, 13, -89, 46, 57, -85, + 64, -38, -57, 88, -18, -73, 80, 4, 64, -38, -57, 88, -18, -73, 80, 4, // 80 +-83, 67, 25, -90, 50, 46, -90, 31, -83, 67, 25, -90, 50, 46, -90, 31, + 64, -85, 9, 78, -75, -13, 87, -61, 64, -85, 9, 78, -75, -13, 87, -61, +-36, 90, -43, -54, 89, -22, -70, 82, -36, 90, -43, -54, 89, -22, -70, 82, + 64, -46, -43, 90, -50, -38, 90, -54, 64, -46, -43, 90, -50, -38, 90, -54, +-36, 90, -57, -31, 89, -61, -25, 88, -36, 90, -57, -31, 89, -61, -25, 88, +-64, -22, 87, -67, -18, 85, -70, -13, -64, -22, 87, -67, -18, 85, -70, -13, + 83, -73, -9, 82, -75, -4, 80, -78, 83, -73, -9, 82, -75, -4, 80, -78, + 64, -54, -25, 85, -75, 4, 70, -88, 64, -54, -25, 85, -75, 4, 70, -88, // 88 + 36, 46, -90, 61, 18, -82, 80, -13, 36, 46, -90, 61, 18, -82, 80, -13, +-64, 90, -43, -38, 89, -67, -9, 78, -64, 90, -43, -38, 89, -67, -9, 78, +-83, 22, 57, -90, 50, 31, -87, 73, -83, 22, 57, -90, 50, 31, -87, 73, + 64, -61, -9, 73, -89, 46, 25, -82, 64, -61, -9, 73, -89, 46, 25, -82, + 83, -31, -43, 88, -75, 13, 57, -90, 83, -31, -43, 88, -75, 13, 57, -90, + 64, 4, -70, 90, -50, -22, 80, -85, 
64, 4, -70, 90, -50, -22, 80, -85, + 36, 38, -87, 78, -18, -54, 90, -67, 36, 38, -87, 78, -18, -54, 90, -67, + 64, -67, 9, 54, -89, 78, -25, -38, 64, -67, 9, 54, -89, 78, -25, -38, // 96 + 83, -85, 43, 22, -75, 90, -57, -4, 83, -85, 43, 22, -75, 90, -57, -4, + 64, -90, 70, -13, -50, 88, -80, 31, 64, -90, 70, -13, -50, 88, -80, 31, + 36, -82, 87, -46, -18, 73, -90, 61, 36, -82, 87, -46, -18, 73, -90, 61, + 64, -73, 25, 31, -75, 90, -70, 22, 64, -73, 25, 31, -75, 90, -70, 22, + 36, -78, 90, -67, 18, 38, -80, 90, 36, -78, 90, -67, 18, 38, -80, 90, +-64, 13, 43, -82, 89, -61, 9, 46, -64, 13, 43, -82, 89, -61, 9, 46, +-83, 88, -57, 4, 50, -85, 87, -54, -83, 88, -57, 4, 50, -85, 87, -54, + 64, -78, 43, 4, -50, 82, -90, 73, 64, -78, 43, 4, -50, 82, -90, 73, // 104 +-36, -13, 57, -85, 89, -67, 25, 22, -36, -13, 57, -85, 89, -67, 25, 22, +-64, 88, -87, 61, -18, -31, 70, -90, -64, 88, -87, 61, -18, -31, 70, -90, + 83, -54, 9, 38, -75, 90, -80, 46, 83, -54, 9, 38, -75, 90, -80, 46, + 64, -82, 57, -22, -18, 54, -80, 90, 64, -82, 57, -22, -18, 54, -80, 90, +-83, 61, -25, -13, 50, -78, 90, -85, -83, 61, -25, -13, 50, -78, 90, -85, + 64, -31, -9, 46, -75, 90, -87, 67, 64, -31, -9, 46, -75, 90, -87, 67, +-36, -4, 43, -73, 89, -88, 70, -38, -36, -4, 43, -73, 89, -88, 70, -38, + 64, -85, 70, -46, 18, 13, -43, 67, 64, -85, 70, -46, 18, 13, -43, 67, // 112 +-83, 90, -87, 73, -50, 22, 9, -38, -83, 90, -87, 73, -50, 22, 9, -38, + 64, -82, 90, -88, 75, -54, 25, 4, 64, -82, 90, -88, 75, -54, 25, 4, +-36, 61, -80, 90, -89, 78, -57, 31, -36, 61, -80, 90, -89, 78, -57, 31, + 64, -88, 80, -67, 50, -31, 9, 13, 64, -88, 80, -67, 50, -31, 9, 13, +-36, 54, -70, 82, -89, 90, -87, 78, -36, 54, -70, 82, -89, 90, -87, 78, +-64, 46, -25, 4, 18, -38, 57, -73, -64, 46, -25, 4, 18, -38, 57, -73, + 83, -90, 90, -85, 75, -61, 43, -22, 83, -90, 90, -85, 75, -61, 43, -22, + 64, -90, 87, -82, 75, -67, 57, -46, 64, -90, 87, -82, 75, -67, 57, -46, // 120 + 36, -22, 9, 4, -18, 31, -43, 54, 36, -22, 9, 4, -18, 
31, -43, 54, +-64, 73, -80, 85, -89, 90, -90, 88, -64, 73, -80, 85, -89, 90, -90, 88, +-83, 78, -70, 61, -50, 38, -25, 13, -83, 78, -70, 61, -50, 38, -25, 13, + 64, -90, 90, -90, 89, -88, 87, -85, 64, -90, 90, -90, 89, -88, 87, -85, + 83, -82, 80, -78, 75, -73, 70, -67, 83, -82, 80, -78, 75, -73, 70, -67, + 64, -61, 57, -54, 50, -46, 43, -38, 64, -61, 57, -54, 50, -46, 43, -38, + 36, -31, 25, -22, 18, -13, 9, -4, 36, -31, 25, -22, 18, -13, 9, -4, +}; + + +// 4xN +ALIGNED(32) const int16_t ff_dct2_4x8_coeff_ver[256] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, +-18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, + 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, +-83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, + 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, + 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, // 8 + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, +-75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, + 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, +-36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, + 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, + 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_4x8_coeff_ver[256] = { + 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, // 0 + 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, + 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, + 32, -17, -60, -85, 32, -17, 
-60, -85, 32, -17, -60, -85, 32, -17, -60, -85, + 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, -46, +-86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, + 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, + 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, + 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, // 8 + 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, + 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, +-60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, + 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, +-46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, + 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, + 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_4x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, 
-86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_4xN_coeff_hor[64] = { + 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, + 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, + 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, + 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4xN_coeff_hor[64] = { + 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, + 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, + 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, + 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, + 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, + 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, + 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x8_coeff_hor[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x8_coeff_hor[128] 
= { + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, + 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, + 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, +-74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, +-74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 
64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_4x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 
46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x16_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x16_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, 
-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x16_coeff_ver[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 
90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_4x16_coeff_ver[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, 
-48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_4x16_coeff_ver[512] = { + 88, 88, 87, 85, 81, 77, 73, 68, 88, 88, 87, 85, 81, 77, 73, 68, // 0 + 62, 55, 48, 40, 33, 25, 17, 8, 62, 55, 48, 40, 
33, 25, 17, 8, + 88, 81, 68, 48, 25, 0, -25, -48, 88, 81, 68, 48, 25, 0, -25, -48, +-68, -81, -88, -88, -81, -68, -48, -25, -68, -81, -88, -88, -81, -68, -48, -25, + 87, 68, 33, -8, -48, -77, -88, -81, 87, 68, 33, -8, -48, -77, -88, -81, +-55, -17, 25, 62, 85, 88, 73, 40, -55, -17, 25, 62, 85, 88, 73, 40, + 85, 48, -8, -62, -88, -77, -33, 25, 85, 48, -8, -62, -88, -77, -33, 25, + 73, 88, 68, 17, -40, -81, -87, -55, 73, 88, 68, 17, -40, -81, -87, -55, + 81, 25, -48, -88, -68, 0, 68, 88, 81, 25, -48, -88, -68, 0, 68, 88, // 8 + 48, -25, -81, -81, -25, 48, 88, 68, 48, -25, -81, -81, -25, 48, 88, 68, + 77, 0, -77, -77, 0, 77, 77, 0, 77, 0, -77, -77, 0, 77, 77, 0, +-77, -77, 0, 77, 77, 0, -77, -77, -77, -77, 0, 77, 77, 0, -77, -77, + 73, -25, -88, -33, 68, 77, -17, -88, 73, -25, -88, -33, 68, 77, -17, -88, +-40, 62, 81, -8, -87, -48, 55, 85, -40, 62, 81, -8, -87, -48, 55, 85, + 68, -48, -81, 25, 88, 0, -88, -25, 68, -48, -81, 25, 88, 0, -88, -25, + 81, 48, -68, -68, 48, 81, -25, -88, 81, 48, -68, -68, 48, 81, -25, -88, + 62, -68, -55, 73, 48, -77, -40, 81, 62, -68, -55, 73, 48, -77, -40, 81, // 16 + 33, -85, -25, 87, 17, -88, -8, 88, 33, -85, -25, 87, 17, -88, -8, 88, + 55, -81, -17, 88, -25, -77, 62, 48, 55, -81, -17, 88, -25, -77, 62, 48, +-85, -8, 88, -33, -73, 68, 40, -87, -85, -8, 88, -33, -73, 68, 40, -87, + 48, -88, 25, 68, -81, 0, 81, -68, 48, -88, 25, 68, -81, 0, 81, -68, +-25, 88, -48, -48, 88, -25, -68, 81, -25, 88, -48, -48, 88, -25, -68, 81, + 40, -88, 62, 17, -81, 77, -8, -68, 40, -88, 62, 17, -81, 77, -8, -68, + 87, -33, -48, 88, -55, -25, 85, -73, 87, -33, -48, 88, -55, -25, 85, -73, + 33, -81, 85, -40, -25, 77, -87, 48, 33, -81, 85, -40, -25, 77, -87, 48, // 24 + 17, -73, 88, -55, -8, 68, -88, 62, 17, -73, 88, -55, -8, 68, -88, 62, + 25, -68, 88, -81, 48, 0, -48, 81, 25, -68, 88, -81, 48, 0, -48, 81, +-88, 68, -25, -25, 68, -88, 81, -48, -88, 68, -25, -25, 68, -88, 81, -48, + 17, -48, 73, -87, 88, -77, 55, -25, 17, -48, 73, -87, 88, -77, 55, -25, + -8, 
40, -68, 85, -88, 81, -62, 33, -8, 40, -68, 85, -88, 81, -62, 33, + 8, -25, 40, -55, 68, -77, 85, -88, 8, -25, 40, -55, 68, -77, 85, -88, + 88, -87, 81, -73, 62, -48, 33, -17, 88, -87, 81, -73, 62, -48, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x32_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x32_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x32_coeff_hor[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, 
-29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +// 8xN +ALIGNED(32) const int16_t ff_dct2_8xN_coeff_hor[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) const int16_t ff_dst7_8xN_coeff_hor[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_8xN_coeff_hor[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, + 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, 
-86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + + + const int16_t* ff_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { + 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, + 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, + 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, + 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, + 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18, +}; + + const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t ff_dct2_8x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_8x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 
55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_8x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x4_coeff_hor[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, 
-50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_8x4_coeff_hor[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_8x4_coeff_hor[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 
78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_8x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + + const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table + + +ALIGNED(32) const int16_t ff_dct2_8x8_coeff_ver[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 89, 50, 75, 18, -18, -75, -50, 
-89, + 83, -36, 36, -83, -83, 36, -36, 83, 75, -89, -18, -50, 50, 18, 89, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 50, 18, -89, 75, -75, 89, -18, -50, + 36, 83, -83, -36, -36, -83, 83, 36, 18, 75, -50, -89, 89, 50, -75, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_8x8_coeff_ver[64] = { + 17, 46, 32, 60, 71, 85, 78, 86, 46, 86, 78, 71, 32, -60, -17, -85, + 71, 32, 85, -46, -86, 17, -60, 78, 85, -60, 46, -78, 17, 32, 86, -71, + 86, -85, -17, 32, 78, -71, -46, 60, 78, -17, -71, 85, -60, 86, -32, -46, + 60, 71, -86, -17, -46, -78, 85, 32, 32, 78, -60, -86, 85, 46, -71, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_8x8_coeff_ver[64] = { + 86, 78, 85, 71, 60, 32, 46, 17, 85, 17, 60, -32, -71, -78, -86, -46, + 78, -60, 17, -86, -46, 85, 32, 71, 71, -86, -32, -17, 78, -46, 60, -85, + 60, -46, -71, 78, 32, -17, -85, 86, 46, 32, -86, 60, -85, 71, 17, -78, + 32, 85, -78, -46, -17, -86, 71, 60, 17, 71, -46, -85, 86, 60, -78, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x8_coeff_hor[512] = { + 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, // 0 + 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, + 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, + 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, + 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, + 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, +-64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, +-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, + 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, // 8 +-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, +-64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, + 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, + 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, +-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, 
-83, -50, + 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, +-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, + 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, // 16 +-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, + 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, +-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, + 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, +-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, +-64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, + 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, + 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, // 24 + 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, +-64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, +-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, + 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, + 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, + 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, + 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_8x8_coeff_hor[512] = { + 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, // 0 + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, + 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, + 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, +-17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 46, 86, 46, 86, 46, 86, 
46, 86, 46, 86, 46, 86, 46, 86, 46, 86, // 8 + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, +-85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, + 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, +-46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, + 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, +-17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, + 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, // 16 +-86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, + 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, +-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, +-60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, +-46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, // 24 + 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, +-71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, +-78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, + 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_8x8_coeff_hor[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, + 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 
46, 60, 46, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, // 8 +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, // 16 +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, // 24 + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 
71, -85, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) const int16_t ff_dct2_8x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_8x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 
17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_8x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +ALIGNED(32) const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 
75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; + + const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; + + const int16_t* fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; + + +ALIGNED(32) const int16_t fi_dct2_8x16_coeff_ver[2048] = { + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, // 0 + 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, + 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, + 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, + 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, + 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, + 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, + 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, // 8 + 75, 57, 75, 57, 75, 57, 
75, 57, 75, 57, 75, 57, 75, 57, 75, 57, + 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, +-18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, +-64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, +-89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, +-83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, +-50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, + 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, // 16 + 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, +-36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, +-89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, +-64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, + 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, + 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, + 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, // 24 + 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, +-83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, +-50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, + 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, +-36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, +-89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, // 32 +-18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, +-83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, + 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, 
-9, 64, -9, 64, -9, 64, -9, +-75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, +-36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, + 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, + 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, // 40 +-50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, +-36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, + 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, +-64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, +-18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, + 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, +-75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, + 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, // 48 +-75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, + 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, + 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, +-64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, + 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, +-83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, + 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, // 56 +-89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, + 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, +-75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, +-50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, + 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, +-18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, + 64, -9, 
64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, // 64 +-89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, + 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, +-75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, +-50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, + 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, +-18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, + 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, // 72 +-75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, + 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, + 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, +-64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, + 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, +-83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, + 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, + 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, // 80 +-50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, +-36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, + 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, +-64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, +-18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, + 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, +-75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, // 88 +-18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, +-83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, + 50, 
-90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, +-75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, +-36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, + 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, // 96 + 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, +-83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, +-50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, + 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, +-36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, +-89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, + 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, // 104 + 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, +-36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, +-89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, +-64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, + 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, + 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, + 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, + 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, // 112 + 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, + 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, +-18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, +-64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, +-89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, +-83, 70, -83, 70, -83, 70, 
-83, 70, -83, 70, -83, 70, -83, 70, -83, 70, +-50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, // 120 + 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, + 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, + 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, + 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, + 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, + 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_8x16_coeff_ver[2048] = { + 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, // 0 + 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, + 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, + 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, + 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, // 8 + 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, + 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, + 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, + -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, +-68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, // 16 + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 
88, 81, 88, 81, 88, 81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, +-88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, + 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, // 24 + 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, + 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, +-88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, + 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, // 32 + 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, +-81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, + 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, + 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, + 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, // 40 + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, +-48, 48, -48, 48, -48, 
48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, + 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, + 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, // 48 +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, +-85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, + 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, +-73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, + 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, + 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, // 56 +-55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, + 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, + 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, +-25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, + 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, + -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, // 64 +-81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, +-68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, + 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, + 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 
73, 25, // 72 +-88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, + 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 80 +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, // 88 +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, +-81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, + 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, + 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, // 96 + -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, +-88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, +-33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, + 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, 
-88, 73, -88, + 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, +-87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, + 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, // 104 + 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, +-48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, + 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, // 112 + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, +-48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 120 + 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, + 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, + 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, + 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, + 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, + 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, + 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, +}; + 
+ALIGNED(32) const int16_t fi_dct8_8x16_coeff_ver[2048] = { + 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, // 0 + 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, + 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, + 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, + 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, + 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, + 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, + 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, // 8 + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, +-88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, +-81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, +-48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, + 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, // 16 + 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, +-48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, + 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, + 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, // 24 + -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, +-88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, +-33, 
25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, + 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, + 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, +-87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, + 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, // 32 +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, +-81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, + 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 40 +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, // 48 +-88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, + 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, 
-87, -48, + 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, // 56 +-81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, +-68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, + 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, // 64 +-55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, + 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, + 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, +-25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, + 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, + -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, + 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, // 72 +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, +-85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, + 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, +-73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, + 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, + 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, // 80 + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, +-81, 0, 
-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, +-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, + 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, + 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, // 88 + 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, +-81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, + 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, + 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, + 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, // 96 + 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, + 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, +-88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, // 104 + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, +-88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, +-25, -25, -25, -25, -25, 
-25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, + 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, // 112 + 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, + 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, + 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, + -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, +-68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, + 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, // 120 + 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, + 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, + 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, +}; + + +ALIGNED(32) const int16_t ff_dct2_8x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 
61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 
90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 
73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_8x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, 
// 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 
26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_8x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, 
-85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, 
-85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; + + const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; + + const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; + + +// 16xN +ALIGNED(32) const int16_t ff_dct2_16xN_coeff_hor[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 
57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_16xN_coeff_hor[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 
55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_16xN_coeff_hor[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, -55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + + const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 
18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + + const int16_t* 
fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, // 8 + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, // 16 + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, // 24 +-43, 43, -43, 
43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, // 32 +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, // 40 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, // 48 +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 
43, -43, 43, -43, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, // 56 +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + + +ALIGNED(32) const int16_t ff_dct2_16x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_16x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, 
-29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x4_coeff_hor[1024] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, + 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, + 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, + 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, +-64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, +-83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, // 8 +-36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, +-64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, + 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, +-83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, + 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, +-36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, // 16 +-83, -25, 
50, 90, -83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, + 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, +-36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, +-36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, +-64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, + 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, // 24 + 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, +-64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, +-83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, + 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, + 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, + 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, // 32 + 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, + 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, + 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, + 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, +-64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, +-83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, // 40 +-36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, +-64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, + 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, 
-18, 80, 64, -57, -18, 80, 64, -57, -18, 80, +-83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, + 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, +-36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, // 48 +-83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, + 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, +-36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, +-36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, +-64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, + 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, // 56 + 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, +-64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, +-83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, + 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, + 64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, + 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x4_coeff_hor[1024] = { + 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, + 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, + 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, + 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, + -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, +-88, -81, -62, -33, 
-88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, // 8 + 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, +-88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, + 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, +-25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, + 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, + -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, // 16 +-81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, + 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, +-55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, +-81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, +-25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, + 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, // 24 +-25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, +-85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, +-73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, + 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, + 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, + 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, // 32 + 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, + 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, 
81, -48, -68, 68, + 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, + 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, +-40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, +-87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, // 40 + 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, +-77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, +-68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, + 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, +-25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, // 48 +-88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, + 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, +-40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, +-48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, +-55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, + 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, // 56 + 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, +-68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, +-81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, + 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, + 62, -55, 48, -40, 62, -55, 48, 
-40, 62, -55, 48, -40, 62, -55, 48, -40, + 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_16x4_coeff_hor[1024] = { + 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, // 0 + 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, + 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, + 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, + 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, + 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, +-68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, +-81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, + 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, // 8 +-48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, +-55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, + 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, + 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, +-88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, + 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, +-40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, + 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, // 16 +-68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, + 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, +-25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, +-77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, // 24 + 68, 77, -17, -88, 68, 77, 
-17, -88, 68, 77, -17, -88, 68, 77, -17, -88, +-40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, +-87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, + 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, + 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, + 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, + 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, + 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, // 32 + 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, + 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, + 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, + 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, +-25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, +-85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, +-73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, + 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, // 40 +-81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, +-25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, + 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, + 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, +-81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, + 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, +-55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, + 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, // 48 +-25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, + 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, + -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, + 25, -68, 88, -81, 25, -68, 88, -81, 25, 
-68, 88, -81, 25, -68, 88, -81, + 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, +-88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, + 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, + 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, // 56 + 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, + -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, +-88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, + 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, + 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, + 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, + 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_16x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, 
-74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t ff_dct2_16x8_coeff_ver[64] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, + 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, + 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, + 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_16x8_coeff_ver[64] = { + 17, 32, 46, 78, 71, 85, 85, 46, 86, -17, 78, -71, 60, -86, 32, -60, + 46, 60, 86, 71, 32, -46, -60, -78, -85, 32, -17, 85, 71, -17, 78, -86, + 71, 78, 32, -17, -86, -60, 17, 86, 78, -46, -60, -32, -46, 85, 85, -71, + 85, 86, -60, -85, 17, 78, 32, -71, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_16x8_coeff_ver[64] = { + 86, 85, 85, 60, 78, 17, 71, -32, 60, -71, 46, -86, 32, -78, 17, -46, + 78, 71, 17, -32, -60, -86, -86, -17, -46, 78, 32, 60, 85, -46, 71, -85, + 60, 46, -71, -86, -46, 32, 78, 60, 32, -85, -85, 17, -17, 71, 86, -78, + 32, 17, -78, -46, 85, 71, -46, -85, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 
18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, + -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, + -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. + + const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. 
+ + const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. + + + const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table + + const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table + + const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table + + +ALIGNED(32) const int16_t ff_dct2_16x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_16x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, 
+ 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_16x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 
81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x16_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 
48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver; + + + const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor; + + const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor; + + const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver; + + +ALIGNED(32) const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 8 + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, 
-22, 22, -22, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 16 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, // 24 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 32 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, // 40 + -4, 4, 
-4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 48 + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 56 + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 64 + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-61, 61, -61, 61, -61, 61, -61, 
61, -61, 61, -61, 61, -61, 61, -61, 61, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 72 +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, // 80 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, // 88 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 
46, -46, 46, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, // 96 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, // 104 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 112 +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 120 + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-31, 
31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 128 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, // 136 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 144 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 82, -82, 82, -82, 82, -82, 
82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, // 152 + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, // 160 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 168 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, // 176 +-88, 88, -88, 88, -88, 88, 
-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 184 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 192 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, // 200 +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, 
-67, 67, -67, 67, -67, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 208 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, // 216 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, // 224 +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 
73, -73, 73, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 232 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, // 240 +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 248 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + +ALIGNED(32) const int16_t ff_dct2_16x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, 
-80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 
85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 
67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_16x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 
46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 
26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_16x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 
56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, 
-26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + const int16_t* 
fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor; + + const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor; + + const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver; + +// 32xN +ALIGNED(32) const int16_t ff_dct2_32xN_coeff_hor[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 
78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 
36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_32xN_coeff_hor[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, 
-63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, 
-53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_32xN_coeff_hor[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, 
-13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 
60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 
21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +ALIGNED(32) const int16_t fi_dct2_32xN_coeff_hor[1024] = { +64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, // 0 + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, // 2 +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, -89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, // 4 + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, -89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, // 6 +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, -25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, // 8 + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, -43, -88, -90, -61, -57, 31, 25, 
90, 87, 54, 70, -38, -9, -90, -80, -46, // 10 +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, -75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, // 12 + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, -75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, // 14 +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, -57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, // 16 + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, -70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, // 18 +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, -50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, // 20 + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, -50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, // 22 +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, -80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 
54, -83, -88, 36, 82, 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, // 24 + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, -87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, // 26 +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, -18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, // 28 + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, -18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, // 30 + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, -90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + + +ALIGNED(32) const int16_t fi_dst7_32xN_coeff_hor[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, // 0 + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, // 2 +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, -21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, -63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, // 4 + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, -89, 60, -85, 85, -66, 87, 
-34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, -53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, // 6 +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, // 8 + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, // 10 +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, -89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, // 12 + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, -84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, // 14 +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, -42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, // 16 + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, -34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, // 18 +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 
85, 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, -77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, // 20 + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, -60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, // 22 +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, -74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, // 24 + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, -78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, // 26 +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, -30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, // 28 + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, -21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, // 30 +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, -89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + +ALIGNED(32) const int16_t fi_dct8_32xN_coeff_hor[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, 
-9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, 
-42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 
60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +const int16_t ff_dct8_4x32_coeff_ver[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, 
-13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, 
-77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; +const int16_t ff_dst7_4x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, 
-21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, 
-60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + + const int16_t* 
ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + + const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; + + +ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = { + 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, // 0 + 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, + 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, + 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, + 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, + 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, +-80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, +-70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, + 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, // 8 +-70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, +-25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, + 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, + 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, +-87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, + 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, +-80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, + 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, // 16 +-25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, + -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, + 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, + 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, + 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, +-87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, + 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, + 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, 
-70, -70, // 24 + 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, + 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, +-57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, + 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, + 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, + 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, + 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, +}; + +ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, + 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, + 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, + 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, + 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, + 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, + 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, + 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, // 8 + 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, + 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, +-31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, +-73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85, +-90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, +-78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, +-38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, + 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, // 16 + 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, 
-13, 13, 31, -31, -13, 13, +-54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, +-90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, +-46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, + 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, + 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, + 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, + 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, // 24 +-13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, +-90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, +-22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, + 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, + 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, +-61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, +-78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, + 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, // 32 +-54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, +-61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, + 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, + 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, +-90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, + 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, + 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, + 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, // 40 +-82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, + 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, + 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, +-88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, + 
31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, + 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, +-90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, + 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, // 48 +-90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, + 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, +-38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, +-13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, + 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, +-88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, + 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, + 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, // 56 +-78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, + 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, +-90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, + 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, +-88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, + 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, +-73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, + 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, // 64 +-46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, + 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, +-13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, + -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, + 22, -22, 85, -85, 22, -22, 85, -85, 22, -22, 85, -85, 22, -22, 85, -85, +-38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, + 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, + 54, -54, -85, 85, 
54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, // 72 + -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, +-46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, + 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, +-90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, + 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, +-22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, +-31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, + 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, // 80 + 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, +-90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, + 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, + 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, +-85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, + 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, + 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, + 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, // 88 + 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, +-67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, +-46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, + 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, + 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, +-90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, + 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, + 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, // 96 + 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, + 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, +-88, 88, 82, -82, -88, 88, 82, -82, 
-88, 88, 82, -82, -88, 88, 82, -82, +-38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, + 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, + 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, +-46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, + 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, // 104 + 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, + 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, + -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, +-78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, +-82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, +-13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, + 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, + 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, // 112 + 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, + 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, + 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, + 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, + 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, +-46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, +-82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, + 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, // 120 + 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, + 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, + 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, + 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, + 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, + 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, 
-88, 88, 85, -85, -88, 88, + 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_32x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 
+}; + + +ALIGNED(32) const int16_t fi_dct2_32x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_32x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x8_coeff_ver[512] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, // 0 + 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 8 + 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, + 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 16 +-18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, + 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, + 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 24 +-75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, + 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, + 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 
18, -75, 18, -75, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, +-83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, + 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_32x8_coeff_ver[512] = { + 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, // 0 + 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, + 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, + 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, // 8 + 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, + 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, +-60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, +-85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, +-17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, + 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, + 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, // 16 + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, + 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, +-60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, +-46, 85, -46, 85, -46, 
85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, // 24 +-60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, + 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, + 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, +-71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, + 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, +-78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, + 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_32x8_coeff_ver[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, // 8 + 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, + 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 
60, 46, // 16 +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, // 24 +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_32x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 
+-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_32x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_32x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, 
-60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, 
-70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_32x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 
81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_32x16_coeff_ver[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_32x16_coeff_ver[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 
62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, 
-73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 
4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 
9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_32x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 
26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, 
-63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_32x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 
13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, 
-34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 
90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + +typedef int32_t TCoeff; +typedef int16_t TMatrixCoeff; + +//! \ingroup CommonLib +//! \{ + + + // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + {a, a}, \ + {a, -a} \ +} + +#define DEFINE_DCT2_P4_MATRIX(a,b,c) \ +{ \ + { a, a, a, a}, \ + { b, c, -c, -b}, \ + { a, -a, -a, a}, \ + { c, -b, b, -c} \ +} + +#define DEFINE_DCT2_P8_MATRIX(a,b,c,d,e,f,g) \ +{ \ + { a, a, a, a, a, a, a, a}, \ + { d, e, f, g, -g, -f, -e, -d}, \ + { b, c, -c, -b, -b, -c, c, b}, \ + { e, -g, -d, -f, f, d, g, -e}, \ + { a, -a, -a, a, a, -a, -a, a}, \ + { f, -d, g, e, -e, -g, d, -f}, \ + { c, -b, b, -c, -c, b, -b, c}, \ + { g, -f, e, -d, d, -e, f, -g} \ +} + +#define DEFINE_DCT2_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n}, \ + { g, 
-f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o} \ +} + +#define DEFINE_DCT2_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, -E, -D, -C, -B, -A, -z, -y, -x, -w, -v, -u, -t, -s, -r, -q, -p}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h, -h, -i, -j, -k, -l, -m, -n, -o, o, n, m, l, k, j, i, h}, \ + { q, t, w, z, C, -E, -B, -y, -v, -s, -p, -r, -u, -x, -A, -D, D, A, x, u, r, p, s, v, y, B, E, -C, -z, -w, -t, -q}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d, d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { r, w, B, -D, -y, -t, -p, -u, -z, -E, A, v, q, s, x, C, -C, -x, -s, -q, -v, -A, E, z, u, p, t, y, D, -B, -w, -r}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i, -i, -l, -o, m, j, h, k, n, -n, -k, -h, -j, -m, o, l, i}, \ + { s, z, -D, -w, -p, -v, -C, A, t, r, y, -E, -x, -q, -u, -B, B, u, q, x, E, -y, -r, -t, -A, C, v, p, w, D, -z, -s}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { t, C, -y, -p, -x, D, u, s, B, -z, -q, -w, E, v, r, A, -A, -r, -v, -E, w, q, z, -B, -s, -u, -D, x, p, y, -C, -t}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j, -j, -o, k, i, n, -l, -h, -m, m, h, l, -n, -i, -k, o, j}, \ + { u, -E, -t, -v, D, s, w, -C, -r, -x, B, q, y, -A, -p, -z, z, p, A, -y, -q, -B, x, r, C, -w, -s, -D, v, t, E, -u}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e, e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { v, -B, -p, -C, u, w, -A, -q, -D, t, x, -z, -r, -E, s, y, -y, -s, E, r, z, -x, -t, D, q, A, -w, -u, C, p, B, -v}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k, -k, m, i, -o, -h, -n, j, l, -l, -j, n, h, o, -i, -m, k}, \ + { 
w, -y, -u, A, s, -C, -q, E, p, D, -r, -B, t, z, -v, -x, x, v, -z, -t, B, r, -D, -p, -E, q, C, -s, -A, u, y, -w}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { x, -v, -z, t, B, -r, -D, p, -E, -q, C, s, -A, -u, y, w, -w, -y, u, A, -s, -C, q, E, -p, D, r, -B, -t, z, v, -x}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l, -l, j, n, -h, o, i, -m, -k, k, m, -i, -o, h, -n, -j, l}, \ + { y, -s, -E, r, -z, -x, t, D, -q, A, w, -u, -C, p, -B, -v, v, B, -p, C, u, -w, -A, q, -D, -t, x, z, -r, E, s, -y}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f, f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { z, -p, A, y, -q, B, x, -r, C, w, -s, D, v, -t, E, u, -u, -E, t, -v, -D, s, -w, -C, r, -x, -B, q, -y, -A, p, -z}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m, -m, h, -l, -n, i, -k, -o, j, -j, o, k, -i, n, l, -h, m}, \ + { A, -r, v, -E, -w, q, -z, -B, s, -u, D, x, -p, y, C, -t, t, -C, -y, p, -x, -D, u, -s, B, z, -q, w, E, -v, r, -A}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { B, -u, q, -x, E, y, -r, t, -A, -C, v, -p, w, -D, -z, s, -s, z, D, -w, p, -v, C, A, -t, r, -y, -E, x, -q, u, -B}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n, -n, k, -h, j, -m, -o, l, -i, i, -l, o, m, -j, h, -k, n}, \ + { C, -x, s, -q, v, -A, -E, z, -u, p, -t, y, -D, -B, w, -r, r, -w, B, D, -y, t, -p, u, -z, E, A, -v, q, -s, x, -C}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g, g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { D, -A, x, -u, r, -p, s, -v, y, -B, E, C, -z, w, -t, q, -q, t, -w, z, -C, -E, B, -y, v, -s, p, -r, u, -x, A, -D}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o, -o, n, -m, l, -k, j, -i, h, -h, i, -j, k, -l, m, -n, o}, \ + { E, -D, C, -B, A, -z, y, -x, w, -v, u, -t, s, -r, q, -p, p, -q, r, -s, t, -u, v, -w, x, -y, z, 
-A, B, -C, D, -E} \ +} + + +#define DEFINE_DCT2_P64_MATRIX(aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck) \ +{ \ + { aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa }, \ + { bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck, -ck, -cj, -ci, -ch, -cg, -cf, -ce, -cd, -cc, -cb, -ca, -bz, -by, -bx, -bw, -bv, -bu, -bt, -bs, -br, -bq, -bp, -bo, -bn, -bm, -bl, -bk, -bj, -bi, -bh, -bg, -bf }, \ + { ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, -be, -bd, -bc, -bb, -ba, -az, -ay, -ax, -aw, -av, -au, -at, -as, -ar, -aq, -ap, -ap, -aq, -ar, -as, -at, -au, -av, -aw, -ax, -ay, -az, -ba, -bb, -bc, -bd, -be, be, bd, bc, bb, ba, az, ay, ax, aw, av, au, at, as, ar, aq, ap }, \ + { bg, bj, bm, bp, bs, bv, by, cb, ce, ch, ck, -ci, -cf, -cc, -bz, -bw, -bt, -bq, -bn, -bk, -bh, -bf, -bi, -bl, -bo, -br, -bu, -bx, -ca, -cd, -cg, -cj, cj, cg, cd, ca, bx, bu, br, bo, bl, bi, bf, bh, bk, bn, bq, bt, bw, bz, cc, cf, ci, -ck, -ch, -ce, -cb, -by, -bv, -bs, -bp, -bm, -bj, -bg }, \ + { ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah, ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah }, \ + { bh, bm, br, bw, cb, cg, -ck, -cf, -ca, -bv, -bq, -bl, -bg, -bi, -bn, -bs, -bx, -cc, -ch, cj, ce, bz, bu, bp, bk, bf, bj, bo, bt, by, cd, ci, -ci, -cd, -by, -bt, -bo, -bj, -bf, -bk, -bp, -bu, -bz, -ce, -cj, ch, cc, bx, bs, bn, bi, 
bg, bl, bq, bv, ca, cf, ck, -cg, -cb, -bw, -br, -bm, -bh }, \ + { aq, at, aw, az, bc, -be, -bb, -ay, -av, -as, -ap, -ar, -au, -ax, -ba, -bd, bd, ba, ax, au, ar, ap, as, av, ay, bb, be, -bc, -az, -aw, -at, -aq, -aq, -at, -aw, -az, -bc, be, bb, ay, av, as, ap, ar, au, ax, ba, bd, -bd, -ba, -ax, -au, -ar, -ap, -as, -av, -ay, -bb, -be, bc, az, aw, at, aq }, \ + { bi, bp, bw, cd, ck, -ce, -bx, -bq, -bj, -bh, -bo, -bv, -cc, -cj, cf, by, br, bk, bg, bn, bu, cb, ci, -cg, -bz, -bs, -bl, -bf, -bm, -bt, -ca, -ch, ch, ca, bt, bm, bf, bl, bs, bz, cg, -ci, -cb, -bu, -bn, -bg, -bk, -br, -by, -cf, cj, cc, bv, bo, bh, bj, bq, bx, ce, -ck, -cd, -bw, -bp, -bi }, \ + { ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad }, \ + { bj, bs, cb, ck, -cc, -bt, -bk, -bi, -br, -ca, -cj, cd, bu, bl, bh, bq, bz, ci, -ce, -bv, -bm, -bg, -bp, -by, -ch, cf, bw, bn, bf, bo, bx, cg, -cg, -bx, -bo, -bf, -bn, -bw, -cf, ch, by, bp, bg, bm, bv, ce, -ci, -bz, -bq, -bh, -bl, -bu, -cd, cj, ca, br, bi, bk, bt, cc, -ck, -cb, -bs, -bj }, \ + { ar, aw, bb, -bd, -ay, -at, -ap, -au, -az, -be, ba, av, aq, as, ax, bc, -bc, -ax, -as, -aq, -av, -ba, be, az, au, ap, at, ay, bd, -bb, -aw, -ar, -ar, -aw, -bb, bd, ay, at, ap, au, az, be, -ba, -av, -aq, -as, -ax, -bc, bc, ax, as, aq, av, ba, -be, -az, -au, -ap, -at, -ay, -bd, bb, aw, ar }, \ + { bk, bv, cg, -ce, -bt, -bi, -bm, -bx, -ci, cc, br, bg, bo, bz, ck, -ca, -bp, -bf, -bq, -cb, cj, by, bn, bh, bs, cd, -ch, -bw, -bl, -bj, -bu, -cf, cf, bu, bj, bl, bw, ch, -cd, -bs, -bh, -bn, -by, -cj, cb, bq, bf, bp, ca, -ck, -bz, -bo, -bg, -br, -cc, ci, bx, bm, bi, bt, ce, -cg, -bv, -bk }, \ + { ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai, ai, al, ao, 
-am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai }, \ + { bl, by, -ck, -bx, -bk, -bm, -bz, cj, bw, bj, bn, ca, -ci, -bv, -bi, -bo, -cb, ch, bu, bh, bp, cc, -cg, -bt, -bg, -bq, -cd, cf, bs, bf, br, ce, -ce, -br, -bf, -bs, -cf, cd, bq, bg, bt, cg, -cc, -bp, -bh, -bu, -ch, cb, bo, bi, bv, ci, -ca, -bn, -bj, -bw, -cj, bz, bm, bk, bx, ck, -by, -bl }, \ + { as, az, -bd, -aw, -ap, -av, -bc, ba, at, ar, ay, -be, -ax, -aq, -au, -bb, bb, au, aq, ax, be, -ay, -ar, -at, -ba, bc, av, ap, aw, bd, -az, -as, -as, -az, bd, aw, ap, av, bc, -ba, -at, -ar, -ay, be, ax, aq, au, bb, -bb, -au, -aq, -ax, -be, ay, ar, at, ba, -bc, -av, -ap, -aw, -bd, az, as }, \ + { bm, cb, -cf, -bq, -bi, -bx, cj, bu, bf, bt, ci, -by, -bj, -bp, -ce, cc, bn, bl, ca, -cg, -br, -bh, -bw, ck, bv, bg, bs, ch, -bz, -bk, -bo, -cd, cd, bo, bk, bz, -ch, -bs, -bg, -bv, -ck, bw, bh, br, cg, -ca, -bl, -bn, -cc, ce, bp, bj, by, -ci, -bt, -bf, -bu, -cj, bx, bi, bq, cf, -cb, -bm }, \ + { ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab }, \ + { bn, ce, -ca, -bj, -br, -ci, bw, bf, bv, -cj, -bs, -bi, -bz, cf, bo, bm, cd, -cb, -bk, -bq, -ch, bx, bg, bu, -ck, -bt, -bh, -by, cg, bp, bl, cc, -cc, -bl, -bp, -cg, by, bh, bt, ck, -bu, -bg, -bx, ch, bq, bk, cb, -cd, -bm, -bo, -cf, bz, bi, bs, cj, -bv, -bf, -bw, ci, br, bj, ca, -ce, -bn }, \ + { at, bc, -ay, -ap, -ax, bd, au, as, bb, -az, -aq, -aw, be, av, ar, ba, -ba, -ar, -av, -be, aw, aq, az, -bb, -as, -au, -bd, ax, ap, ay, -bc, -at, -at, -bc, ay, ap, ax, -bd, -au, -as, -bb, az, aq, aw, -be, -av, -ar, -ba, ba, ar, av, be, -aw, -aq, -az, bb, as, au, bd, -ax, -ap, -ay, bc, at }, \ + { bo, ch, -bv, -bh, -ca, cc, bj, bt, -cj, -bq, -bm, -cf, bx, bf, by, -ce, -bl, -br, 
-ck, bs, bk, cd, -bz, -bg, -bw, cg, bn, bp, ci, -bu, -bi, -cb, cb, bi, bu, -ci, -bp, -bn, -cg, bw, bg, bz, -cd, -bk, -bs, ck, br, bl, ce, -by, -bf, -bx, cf, bm, bq, cj, -bt, -bj, -cc, ca, bh, bv, -ch, -bo }, \ + { aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj, aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj }, \ + { bp, ck, -bq, -bo, -cj, br, bn, ci, -bs, -bm, -ch, bt, bl, cg, -bu, -bk, -cf, bv, bj, ce, -bw, -bi, -cd, bx, bh, cc, -by, -bg, -cb, bz, bf, ca, -ca, -bf, -bz, cb, bg, by, -cc, -bh, -bx, cd, bi, bw, -ce, -bj, -bv, cf, bk, bu, -cg, -bl, -bt, ch, bm, bs, -ci, -bn, -br, cj, bo, bq, -ck, -bp }, \ + { au, -be, -at, -av, bd, as, aw, -bc, -ar, -ax, bb, aq, ay, -ba, -ap, -az, az, ap, ba, -ay, -aq, -bb, ax, ar, bc, -aw, -as, -bd, av, at, be, -au, -au, be, at, av, -bd, -as, -aw, bc, ar, ax, -bb, -aq, -ay, ba, ap, az, -az, -ap, -ba, ay, aq, bb, -ax, -ar, -bc, aw, as, bd, -av, -at, -be, au }, \ + { bq, -ci, -bl, -bv, cd, bg, ca, -by, -bi, -cf, bt, bn, ck, -bo, -bs, cg, bj, bx, -cb, -bf, -cc, bw, bk, ch, -br, -bp, cj, bm, bu, -ce, -bh, -bz, bz, bh, ce, -bu, -bm, -cj, bp, br, -ch, -bk, -bw, cc, bf, cb, -bx, -bj, -cg, bs, bo, -ck, -bn, -bt, cf, bi, by, -ca, -bg, -cd, bv, bl, ci, -bq }, \ + { ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae }, \ + { br, -cf, -bg, -cc, bu, bo, -ci, -bj, -bz, bx, bl, ck, -bm, -bw, ca, bi, ch, -bp, -bt, cd, bf, ce, -bs, -bq, cg, bh, cb, -bv, -bn, cj, bk, by, -by, -bk, -cj, bn, bv, -cb, -bh, -cg, bq, bs, -ce, -bf, -cd, bt, bp, -ch, -bi, -ca, bw, bm, -ck, -bl, -bx, bz, bj, ci, -bo, -bu, cc, bg, cf, -br }, \ + { av, -bb, 
-ap, -bc, au, aw, -ba, -aq, -bd, at, ax, -az, -ar, -be, as, ay, -ay, -as, be, ar, az, -ax, -at, bd, aq, ba, -aw, -au, bc, ap, bb, -av, -av, bb, ap, bc, -au, -aw, ba, aq, bd, -at, -ax, az, ar, be, -as, -ay, ay, as, -be, -ar, -az, ax, at, -bd, -aq, -ba, aw, au, -bc, -ap, -bb, av }, \ + { bs, -cc, -bi, -cj, bl, bz, -bv, -bp, cf, bf, cg, -bo, -bw, by, bm, -ci, -bh, -cd, br, bt, -cb, -bj, -ck, bk, ca, -bu, -bq, ce, bg, ch, -bn, -bx, bx, bn, -ch, -bg, -ce, bq, bu, -ca, -bk, ck, bj, cb, -bt, -br, cd, bh, ci, -bm, -by, bw, bo, -cg, -bf, -cf, bp, bv, -bz, -bl, cj, bi, cc, -bs }, \ + { ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak, ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak }, \ + { bt, -bz, -bn, cf, bh, ck, -bi, -ce, bo, by, -bu, -bs, ca, bm, -cg, -bg, -cj, bj, cd, -bp, -bx, bv, br, -cb, -bl, ch, bf, ci, -bk, -cc, bq, bw, -bw, -bq, cc, bk, -ci, -bf, -ch, bl, cb, -br, -bv, bx, bp, -cd, -bj, cj, bg, cg, -bm, -ca, bs, bu, -by, -bo, ce, bi, -ck, -bh, -cf, bn, bz, -bt }, \ + { aw, -ay, -au, ba, as, -bc, -aq, be, ap, bd, -ar, -bb, at, az, -av, -ax, ax, av, -az, -at, bb, ar, -bd, -ap, -be, aq, bc, -as, -ba, au, ay, -aw, -aw, ay, au, -ba, -as, bc, aq, -be, -ap, -bd, ar, bb, -at, -az, av, ax, -ax, -av, az, at, -bb, -ar, bd, ap, be, -aq, -bc, as, ba, -au, -ay, aw }, \ + { bu, -bw, -bs, by, bq, -ca, -bo, cc, bm, -ce, -bk, cg, bi, -ci, -bg, ck, bf, cj, -bh, -ch, bj, cf, -bl, -cd, bn, cb, -bp, -bz, br, bx, -bt, -bv, bv, bt, -bx, -br, bz, bp, -cb, -bn, cd, bl, -cf, -bj, ch, bh, -cj, -bf, -ck, bg, ci, -bi, -cg, bk, ce, -bm, -cc, bo, ca, -bq, -by, bs, bw, -bu }, \ + { aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, 
aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa }, \ + { bv, -bt, -bx, br, bz, -bp, -cb, bn, cd, -bl, -cf, bj, ch, -bh, -cj, bf, -ck, -bg, ci, bi, -cg, -bk, ce, bm, -cc, -bo, ca, bq, -by, -bs, bw, bu, -bu, -bw, bs, by, -bq, -ca, bo, cc, -bm, -ce, bk, cg, -bi, -ci, bg, ck, -bf, cj, bh, -ch, -bj, cf, bl, -cd, -bn, cb, bp, -bz, -br, bx, bt, -bv }, \ + { ax, -av, -az, at, bb, -ar, -bd, ap, -be, -aq, bc, as, -ba, -au, ay, aw, -aw, -ay, au, ba, -as, -bc, aq, be, -ap, bd, ar, -bb, -at, az, av, -ax, -ax, av, az, -at, -bb, ar, bd, -ap, be, aq, -bc, -as, ba, au, -ay, -aw, aw, ay, -au, -ba, as, bc, -aq, -be, ap, -bd, -ar, bb, at, -az, -av, ax }, \ + { bw, -bq, -cc, bk, ci, -bf, ch, bl, -cb, -br, bv, bx, -bp, -cd, bj, cj, -bg, cg, bm, -ca, -bs, bu, by, -bo, -ce, bi, ck, -bh, cf, bn, -bz, -bt, bt, bz, -bn, -cf, bh, -ck, -bi, ce, bo, -by, -bu, bs, ca, -bm, -cg, bg, -cj, -bj, cd, bp, -bx, -bv, br, cb, -bl, -ch, bf, -ci, -bk, cc, bq, -bw }, \ + { al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al, al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al }, \ + { bx, -bn, -ch, bg, -ce, -bq, bu, ca, -bk, -ck, bj, -cb, -bt, br, cd, -bh, ci, bm, -by, -bw, bo, cg, -bf, cf, bp, -bv, -bz, bl, cj, -bi, cc, bs, -bs, -cc, bi, -cj, -bl, bz, bv, -bp, -cf, bf, -cg, -bo, bw, by, -bm, -ci, bh, -cd, -br, bt, cb, -bj, ck, bk, -ca, -bu, bq, ce, -bg, ch, bn, -bx }, \ + { ay, -as, -be, ar, -az, -ax, at, bd, -aq, ba, aw, -au, -bc, ap, -bb, -av, av, bb, -ap, bc, au, -aw, -ba, aq, -bd, -at, ax, az, -ar, be, as, -ay, -ay, as, be, -ar, az, ax, -at, -bd, aq, -ba, -aw, au, bc, -ap, bb, av, -av, -bb, ap, -bc, -au, aw, ba, -aq, bd, at, -ax, -az, ar, -be, -as, ay }, \ + { by, -bk, cj, bn, -bv, -cb, bh, -cg, -bq, bs, ce, -bf, cd, bt, -bp, -ch, bi, -ca, -bw, bm, ck, -bl, bx, bz, -bj, ci, bo, -bu, -cc, bg, -cf, -br, br, cf, -bg, 
cc, bu, -bo, -ci, bj, -bz, -bx, bl, -ck, -bm, bw, ca, -bi, ch, bp, -bt, -cd, bf, -ce, -bs, bq, cg, -bh, cb, bv, -bn, -cj, bk, -by }, \ + { af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af }, \ + { bz, -bh, ce, bu, -bm, cj, bp, -br, -ch, bk, -bw, -cc, bf, -cb, -bx, bj, -cg, -bs, bo, ck, -bn, bt, cf, -bi, by, ca, -bg, cd, bv, -bl, ci, bq, -bq, -ci, bl, -bv, -cd, bg, -ca, -by, bi, -cf, -bt, bn, -ck, -bo, bs, cg, -bj, bx, cb, -bf, cc, bw, -bk, ch, br, -bp, -cj, bm, -bu, -ce, bh, -bz }, \ + { az, -ap, ba, ay, -aq, bb, ax, -ar, bc, aw, -as, bd, av, -at, be, au, -au, -be, at, -av, -bd, as, -aw, -bc, ar, -ax, -bb, aq, -ay, -ba, ap, -az, -az, ap, -ba, -ay, aq, -bb, -ax, ar, -bc, -aw, as, -bd, -av, at, -be, -au, au, be, -at, av, bd, -as, aw, bc, -ar, ax, bb, -aq, ay, ba, -ap, az }, \ + { ca, -bf, bz, cb, -bg, by, cc, -bh, bx, cd, -bi, bw, ce, -bj, bv, cf, -bk, bu, cg, -bl, bt, ch, -bm, bs, ci, -bn, br, cj, -bo, bq, ck, -bp, bp, -ck, -bq, bo, -cj, -br, bn, -ci, -bs, bm, -ch, -bt, bl, -cg, -bu, bk, -cf, -bv, bj, -ce, -bw, bi, -cd, -bx, bh, -cc, -by, bg, -cb, -bz, bf, -ca }, \ + { am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am, am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am }, \ + { cb, -bi, bu, ci, -bp, bn, -cg, -bw, bg, -bz, -cd, bk, -bs, -ck, br, -bl, ce, by, -bf, bx, cf, -bm, bq, -cj, -bt, bj, -cc, -ca, bh, -bv, -ch, bo, -bo, ch, bv, -bh, ca, cc, -bj, bt, cj, -bq, bm, -cf, -bx, bf, -by, -ce, bl, -br, ck, bs, -bk, cd, bz, -bg, bw, cg, -bn, bp, -ci, -bu, bi, -cb }, \ + { ba, -ar, av, -be, -aw, aq, -az, -bb, as, -au, bd, ax, -ap, ay, bc, -at, at, -bc, 
-ay, ap, -ax, -bd, au, -as, bb, az, -aq, aw, be, -av, ar, -ba, -ba, ar, -av, be, aw, -aq, az, bb, -as, au, -bd, -ax, ap, -ay, -bc, at, -at, bc, ay, -ap, ax, bd, -au, as, -bb, -az, aq, -aw, -be, av, -ar, ba }, \ + { cc, -bl, bp, -cg, -by, bh, -bt, ck, bu, -bg, bx, ch, -bq, bk, -cb, -cd, bm, -bo, cf, bz, -bi, bs, -cj, -bv, bf, -bw, -ci, br, -bj, ca, ce, -bn, bn, -ce, -ca, bj, -br, ci, bw, -bf, bv, cj, -bs, bi, -bz, -cf, bo, -bm, cd, cb, -bk, bq, -ch, -bx, bg, -bu, -ck, bt, -bh, by, cg, -bp, bl, -cc }, \ + { ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac }, \ + { cd, -bo, bk, -bz, -ch, bs, -bg, bv, -ck, -bw, bh, -br, cg, ca, -bl, bn, -cc, -ce, bp, -bj, by, ci, -bt, bf, -bu, cj, bx, -bi, bq, -cf, -cb, bm, -bm, cb, cf, -bq, bi, -bx, -cj, bu, -bf, bt, -ci, -by, bj, -bp, ce, cc, -bn, bl, -ca, -cg, br, -bh, bw, ck, -bv, bg, -bs, ch, bz, -bk, bo, -cd }, \ + { bb, -au, aq, -ax, be, ay, -ar, at, -ba, -bc, av, -ap, aw, -bd, -az, as, -as, az, bd, -aw, ap, -av, bc, ba, -at, ar, -ay, -be, ax, -aq, au, -bb, -bb, au, -aq, ax, -be, -ay, ar, -at, ba, bc, -av, ap, -aw, bd, az, -as, as, -az, -bd, aw, -ap, av, -bc, -ba, at, -ar, ay, be, -ax, aq, -au, bb }, \ + { ce, -br, bf, -bs, cf, cd, -bq, bg, -bt, cg, cc, -bp, bh, -bu, ch, cb, -bo, bi, -bv, ci, ca, -bn, bj, -bw, cj, bz, -bm, bk, -bx, ck, by, -bl, bl, -by, -ck, bx, -bk, bm, -bz, -cj, bw, -bj, bn, -ca, -ci, bv, -bi, bo, -cb, -ch, bu, -bh, bp, -cc, -cg, bt, -bg, bq, -cd, -cf, bs, -bf, br, -ce }, \ + { an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an, an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an }, \ + { cf, -bu, 
bj, -bl, bw, -ch, -cd, bs, -bh, bn, -by, cj, cb, -bq, bf, -bp, ca, ck, -bz, bo, -bg, br, -cc, -ci, bx, -bm, bi, -bt, ce, cg, -bv, bk, -bk, bv, -cg, -ce, bt, -bi, bm, -bx, ci, cc, -br, bg, -bo, bz, -ck, -ca, bp, -bf, bq, -cb, -cj, by, -bn, bh, -bs, cd, ch, -bw, bl, -bj, bu, -cf }, \ + { bc, -ax, as, -aq, av, -ba, -be, az, -au, ap, -at, ay, -bd, -bb, aw, -ar, ar, -aw, bb, bd, -ay, at, -ap, au, -az, be, ba, -av, aq, -as, ax, -bc, -bc, ax, -as, aq, -av, ba, be, -az, au, -ap, at, -ay, bd, bb, -aw, ar, -ar, aw, -bb, -bd, ay, -at, ap, -au, az, -be, -ba, av, -aq, as, -ax, bc }, \ + { cg, -bx, bo, -bf, bn, -bw, cf, ch, -by, bp, -bg, bm, -bv, ce, ci, -bz, bq, -bh, bl, -bu, cd, cj, -ca, br, -bi, bk, -bt, cc, ck, -cb, bs, -bj, bj, -bs, cb, -ck, -cc, bt, -bk, bi, -br, ca, -cj, -cd, bu, -bl, bh, -bq, bz, -ci, -ce, bv, -bm, bg, -bp, by, -ch, -cf, bw, -bn, bf, -bo, bx, -cg }, \ + { ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag }, \ + { ch, -ca, bt, -bm, bf, -bl, bs, -bz, cg, ci, -cb, bu, -bn, bg, -bk, br, -by, cf, cj, -cc, bv, -bo, bh, -bj, bq, -bx, ce, ck, -cd, bw, -bp, bi, -bi, bp, -bw, cd, -ck, -ce, bx, -bq, bj, -bh, bo, -bv, cc, -cj, -cf, by, -br, bk, -bg, bn, -bu, cb, -ci, -cg, bz, -bs, bl, -bf, bm, -bt, ca, -ch }, \ + { bd, -ba, ax, -au, ar, -ap, as, -av, ay, -bb, be, bc, -az, aw, -at, aq, -aq, at, -aw, az, -bc, -be, bb, -ay, av, -as, ap, -ar, au, -ax, ba, -bd, -bd, ba, -ax, au, -ar, ap, -as, av, -ay, bb, -be, -bc, az, -aw, at, -aq, aq, -at, aw, -az, bc, be, -bb, ay, -av, as, -ap, ar, -au, ax, -ba, bd }, \ + { ci, -cd, by, -bt, bo, -bj, bf, -bk, bp, -bu, bz, -ce, cj, ch, -cc, bx, -bs, bn, -bi, bg, -bl, bq, -bv, ca, -cf, ck, cg, -cb, bw, -br, bm, -bh, bh, -bm, br, -bw, cb, -cg, -ck, cf, -ca, bv, -bq, bl, -bg, bi, -bn, bs, -bx, cc, -ch, 
-cj, ce, -bz, bu, -bp, bk, -bf, bj, -bo, bt, -by, cd, -ci }, \ + { ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao, ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao }, \ + { cj, -cg, cd, -ca, bx, -bu, br, -bo, bl, -bi, bf, -bh, bk, -bn, bq, -bt, bw, -bz, cc, -cf, ci, ck, -ch, ce, -cb, by, -bv, bs, -bp, bm, -bj, bg, -bg, bj, -bm, bp, -bs, bv, -by, cb, -ce, ch, -ck, -ci, cf, -cc, bz, -bw, bt, -bq, bn, -bk, bh, -bf, bi, -bl, bo, -br, bu, -bx, ca, -cd, cg, -cj }, \ + { be, -bd, bc, -bb, ba, -az, ay, -ax, aw, -av, au, -at, as, -ar, aq, -ap, ap, -aq, ar, -as, at, -au, av, -aw, ax, -ay, az, -ba, bb, -bc, bd, -be, -be, bd, -bc, bb, -ba, az, -ay, ax, -aw, av, -au, at, -as, ar, -aq, ap, -ap, aq, -ar, as, -at, au, -av, aw, -ax, ay, -az, ba, -bb, bc, -bd, be }, \ + { ck, -cj, ci, -ch, cg, -cf, ce, -cd, cc, -cb, ca, -bz, by, -bx, bw, -bv, bu, -bt, bs, -br, bq, -bp, bo, -bn, bm, -bl, bk, -bj, bi, -bh, bg, -bf, bf, -bg, bh, -bi, bj, -bk, bl, -bm, bn, -bo, bp, -bq, br, -bs, bt, -bu, bv, -bw, bx, -by, bz, -ca, cb, -cc, cd, -ce, cf, -cg, ch, -ci, cj, -ck }, \ + } + +// DCT-8 +#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d,}, \ + { b, 0, -b, -b,}, \ + { c, -b, -d, a,}, \ + { d, -b, a, -c,}, \ +} + +#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { b, e, h, -g, -d, -a, -c, -f,}, \ + { c, h, -e, -a, -f, g, b, d,}, \ + { d, -g, -a, -h, c, e, -f, -b,}, \ + { e, -d, -f, c, g, -b, -h, a,}, \ + { f, -a, g, e, -b, h, d, -c,}, \ + { g, -c, b, -f, -h, d, -a, e,}, \ + { h, -f, d, -b, a, -c, e, -g,}, \ +} + +#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n,}, \ + { c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l,}, 
\ + { d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j,}, \ + { e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h,}, \ + { f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f,}, \ + { g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d,}, \ + { h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b,}, \ + { i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a,}, \ + { j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c,}, \ + { k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e,}, \ + { l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g,}, \ + { m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i,}, \ + { n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k,}, \ + { o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m,}, \ + { p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o,}, \ +} + +#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D,}, \ + { c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B,}, \ + { d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z,}, \ + { e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x,}, \ + { f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v,}, \ + { g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t,}, \ + { h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r,}, \ + { i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, 
p,}, \ + { j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n,}, \ + { k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l,}, \ + { l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j,}, \ + { m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h,}, \ + { n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f,}, \ + { o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d,}, \ + { p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b,}, \ + { q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a,}, \ + { r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c,}, \ + { s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e,}, \ + { t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g,}, \ + { u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i,}, \ + { v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k,}, \ + { w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m,}, \ + { x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o,}, \ + { y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q,}, \ + { z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, 
-i, w, C, -o, a, -n, B, x, -j, e, -s,}, \ + { A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u,}, \ + { B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w,}, \ + { C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y,}, \ + { D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A,}, \ + { E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C,}, \ + { F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E,}, \ +} + + +// DST-7 +#define DEFINE_DST7_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d }, \ + { c, c, 0, -c }, \ + { d, -a, -c, b }, \ + { b, -d, c, -a }, \ +} + +#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { c, f, h, e, b, -a, -d, -g,}, \ + { e, g, b, -c, -h, -d, a, f,}, \ + { g, c, -d, -f, a, h, b, -e,}, \ + { h, -a, -g, b, f, -c, -e, d,}, \ + { f, -e, -a, g, -d, -b, h, -c,}, \ + { d, -h, e, -a, -c, g, -f, b,}, \ + { b, -d, f, -h, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o,}, \ + { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n,}, \ + { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m,}, \ + { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l,}, \ + { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k,}, \ + { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j,}, \ + { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i,}, \ + { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h,}, \ + { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g,}, \ + { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, 
c, -o, f,}, \ + { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e,}, \ + { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d,}, \ + { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c,}, \ + { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E,}, \ + { e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D,}, \ + { g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C,}, \ + { i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B,}, \ + { k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A,}, \ + { m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z,}, \ + { o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y,}, \ + { q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x,}, \ + { s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w,}, \ + { u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v,}, \ + { w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u,}, \ + { y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t,}, \ + { A, k, -p, -v, e, F, f, -u, -q, j, 
B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s,}, \ + { C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r,}, \ + { E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q,}, \ + { F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p,}, \ + { D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o,}, \ + { B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n,}, \ + { z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m,}, \ + { x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l,}, \ + { v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k,}, \ + { t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j,}, \ + { r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i,}, \ + { p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h,}, \ + { n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g,}, \ + { l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f,}, \ + { j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e,}, \ + { h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d,}, \ + { f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c,}, \ 
+ { d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \ +} + +#define TRANSFORM_NUMBER_OF_DIRECTIONS 1 +#define ALIGN_DATA(nBytes,v) __declspec(align(nBytes)) v +#define MEMORY_ALIGN_DEF_SIZE 32 // for use with avx2 (256 bit) +//-------------------------------------------------------------------------------------------------- +// DCT-2 +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2]) = +{ + DEFINE_DCT2_P2_MATRIX(64), + //DEFINE_DCT2_P2_MATRIX(64) +}; + +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = +{ + DEFINE_DCT2_P4_MATRIX(64, 83, 36), + //DEFINE_DCT2_P4_MATRIX(64, 83, 36) +}; + +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = +{ + DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18), + //DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18) +}; + +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = +{ + DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9), + //DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9) +}; + +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = +{ + DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4), + //DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) +}; + +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64]) = +{ + DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 
18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2), + //DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2) +}; + +// DCT-8 +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = +{ + DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29), + //DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = +{ + DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17), + //DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = +{ + DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8), + //DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = +{ + DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4), + //DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4) +}; + +// DST-7 +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = +{ + DEFINE_DST7_P4_MATRIX(29, 55, 74, 84), + //DEFINE_DST7_P4_MATRIX(29, 55, 74, 84) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = +{ + 
DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86), + //DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = +{ + DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88), + //DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88) +}; +ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = +{ + DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90), + //DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90) +}; + +//-------------------------------------------------------------------------------------------------- + +static const int16_t* vvenc_matrix_coeffs[3][6] = { + {g_trCoreDCT2P2[0][0], g_trCoreDCT2P4[0][0], g_trCoreDCT2P8[0][0], g_trCoreDCT2P16[0][0], g_trCoreDCT2P32[0][0], g_trCoreDCT2P64[0][0]}, + {NULL, g_trCoreDCT8P4[0][0], g_trCoreDCT8P8[0][0], g_trCoreDCT8P16[0][0], g_trCoreDCT8P32[0][0], NULL}, + {NULL, g_trCoreDST7P4[0][0], g_trCoreDST7P8[0][0], g_trCoreDST7P16[0][0], g_trCoreDST7P32[0][0], NULL}, +}; + +//! 
\} + + + +#endif DCT_AVX2_TABLES_H From b78f9aff1725db2f6dd8fdb74305660174b543e5 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Jul 2023 10:45:39 +0300 Subject: [PATCH 243/254] [avx2] Inverses work when ISP is not enabled --- src/strategies/avx2/dct-avx2.c | 58 +++++++++++++++------------ src/strategies/avx2/dct_avx2_tables.h | 40 ++++++++++++++---- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index f875a581..71361feb 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -2752,9 +2752,9 @@ void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } __m256i v_hor_pass_out; - fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); - fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); + fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); } @@ -3568,39 +3568,46 @@ static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { - const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; const __m256i debias = _mm256_set1_epi32(add); const __m256i* v_coeff = (const __m256i*)coeff; const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver); const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver); - __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]); - __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]); - __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]); - __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]); - __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]); - __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]); - __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]); - __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]); + // Duplicate sources to enable vertical addition + __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2)); - __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); - __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); - __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5); - __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7); + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01); + __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21); + __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31); __m256i v_trunc_0 = 
truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift); __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift); __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); - v_result = _mm256_shuffle_epi8(v_result, v_shuffle1); - v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); - v_result = _mm256_shuffle_epi8(v_result, v_shuffle2); + //v_result = _mm256_shuffle_epi8(v_result, v_shuffle1); + //v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + //v_result = _mm256_shuffle_epi8(v_result, v_shuffle2); _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type) { const int width = 8; const int height = 2; @@ -3617,11 +3624,10 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t __m256i v_ver_pass_out; fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); - + fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); } - void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; @@ -4062,9 +4068,9 @@ void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } __m256i v_hor_pass_out[4]; - fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); - fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); + fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); } @@ -5636,9 +5642,9 @@ void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } __m256i v_hor_pass_out[16]; 
- fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); - fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); + fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); } @@ -8152,7 +8158,7 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); - //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); + success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); } #endif // UVG_BIT_DEPTH == 8 diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h index 2233916b..f56cb2cc 100644 --- a/src/strategies/avx2/dct_avx2_tables.h +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -749,16 +749,40 @@ const int16_t ff_dst7_2x8_coeff_ver[128] = { ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = { - 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18, - 64, 50, 36, 18, 64, 50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50, - 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50, --64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89, - 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89, - 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75, - 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75, --64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18, + 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18, + 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 
18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50, +-64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89, + 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75, +-64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18, }; +ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = { + 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46, + 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60, + 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78, +-85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86, + 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86, + 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71, + 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71, +-71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = { + 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32, + 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46, + 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17, +-46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85, + 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60, + 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78, + 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85, +-17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32, +}; + + + ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, From 13d4313e02562d6ee53bfb2a8cb906baf7879bb3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: 
Wed, 26 Jul 2023 14:05:04 +0300 Subject: [PATCH 244/254] [avx2] Mostly working --- src/strategies/avx2/dct-avx2.c | 41 +++++++-- src/strategies/avx2/dct_avx2_tables.h | 121 ++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 71361feb..d82d6415 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -2174,6 +2174,9 @@ void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t const int32_t shift_2nd = INVERSE_SHIFT_2ND; const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_8x2_coeff_hor; + } const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -2359,6 +2362,9 @@ void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_2nd = INVERSE_SHIFT_2ND; const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_16x2_coeff_hor; + } const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -3607,7 +3613,7 @@ static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type) +void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 2; @@ -3620,6 +3626,9 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_ty const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x8_coeff_ver; + } // Only dct2 transform is defined for this block size 
__m256i v_ver_pass_out; @@ -4810,6 +4819,9 @@ void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x16_coeff_ver; + } // DST7 and DCT8 are not defined for this block size __m256i v_ver_pass_out[2]; @@ -5455,7 +5467,7 @@ static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, con // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); //} - for (int j = 0; j < 16; ++j) { + for (int j = 0; j < line; ++j) { __m256i res_0 = _mm256_setzero_si256(); __m256i res_1 = _mm256_setzero_si256(); @@ -7219,7 +7231,7 @@ void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } __m256i v_ver_pass_out[16]; - if(ver == DCT2) { + if(ver == DCT2 || hor == DCT2) { fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); } else { @@ -8128,9 +8140,26 @@ static void mts_idct_avx2( else { const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - - dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; - idct_func(input, output, type_hor, type_ver); + if (height == 1) { + if (width == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (width == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else if (width == 1){ + if (height == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? 
fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else { + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); + } } } diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h index f56cb2cc..946ab6b8 100644 --- a/src/strategies/avx2/dct_avx2_tables.h +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -818,6 +818,41 @@ ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, }; +ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = { + 8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85, + 88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25, + 48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55, +-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8 +-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48, +-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33, +-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87, + 48, 77, -40, -81, 48, 
77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16 + 88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8, + 68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77, + 0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81, +-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24 +-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62, +-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88, + 25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40, + 81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8, +}; + ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, @@ -1346,6 +1381,17 @@ ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18, }; +ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = { + 17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32, + 32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60, + 46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78, + 60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86, + 71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85, + 78, -17, -60, 86, -46, 
-32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71, + 85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46, + 86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17, +}; + const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table @@ -2381,6 +2427,43 @@ ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + +ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, 
-77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + + ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, @@ -2881,6 +2964,44 @@ ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { -25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, }; +ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 
43, 50, -9, 75, -57, 89, -87, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver; From 1f9955bdda838c461697c9b0a9a0d9b45b9f49ba Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Jul 2023 15:20:33 +0300 Subject: [PATCH 245/254] [avx2] Fix compilation errors --- src/strategies/avx2/dct-avx2.c | 50 
+++++++------- src/strategies/avx2/dct_avx2_tables.h | 96 --------------------------- 2 files changed, 25 insertions(+), 121 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index d82d6415..566efba3 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -2198,9 +2198,9 @@ void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_2nd = log2_height_minus1 + 7; const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; - const int16_t* ver_coeff = uvg_g_dct_16; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; if (ver == DST7) { - ver_coeff = uvg_g_dst7_16; + ver_coeff = &uvg_g_dst7_16[0][0]; } const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -2389,7 +2389,7 @@ void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_2nd = log2_height_minus1 + 7; const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; - const int16_t* ver_coeff = uvg_g_dct_32; + const int16_t* ver_coeff = &uvg_g_dct_32[0][0]; // For result shuffling, can use existing shuffle vector const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -2562,7 +2562,7 @@ void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_1st = INVERSE_SHIFT_1ST; const int32_t shift_2nd = INVERSE_SHIFT_2ND; - const int16_t* ver_coeff = uvg_g_dct_32_t; // rename + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; // rename const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -2986,16 +2986,16 @@ void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const 
int32_t shift_2nd = log2_height_minus1 + 7; const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; - const int16_t* ver_coeff = uvg_g_dct_16; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; if (hor == DST7) { hor_coeff = fast_forward_dst7_b4_coeff; } else if (hor == DCT8) { hor_coeff = fast_forward_dct8_b4_coeff; } if (ver == DST7) { - ver_coeff = uvg_g_dst7_16; + ver_coeff = &uvg_g_dst7_16[0][0]; } else if (ver == DCT8) { - ver_coeff = uvg_g_dct8_16; + ver_coeff = &uvg_g_dct8_16[0][0]; } __m256i v_hor_pass_out[4]; @@ -3415,7 +3415,7 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_1st = INVERSE_SHIFT_1ST; const int32_t shift_2nd = INVERSE_SHIFT_2ND; - const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename if (hor == DST7) { hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename @@ -3423,9 +3423,9 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename } if (ver == DST7) { - ver_coeff = uvg_g_dst7_32_t; + ver_coeff = &uvg_g_dst7_32_t[0][0]; } else if (ver == DCT8) { - ver_coeff = uvg_g_dct8_32; + ver_coeff = &uvg_g_dct8_32[0][0]; } __m256i v_ver_pass_out[8]; @@ -4587,7 +4587,7 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_1st = INVERSE_SHIFT_1ST; const int32_t shift_2nd = INVERSE_SHIFT_2ND; - const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table if (hor == DST7) { hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename @@ -4595,9 +4595,9 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename } if (ver == DST7) { - ver_coeff = uvg_g_dst7_32_t; + ver_coeff = &uvg_g_dst7_32_t[0][0]; } else 
if (ver == DCT8) { - ver_coeff = uvg_g_dct8_32; + ver_coeff = &uvg_g_dct8_32[0][0]; } __m256i v_ver_pass_out[16]; @@ -5949,7 +5949,7 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_1st = INVERSE_SHIFT_1ST; const int32_t shift_2nd = INVERSE_SHIFT_2ND; - const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; if (hor == DST7) { hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs @@ -5957,9 +5957,9 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, hor_coeff = fi_dct8_16x32_coeff_hor; } if (ver == DST7) { - ver_coeff = uvg_g_dst7_32_t; + ver_coeff = &uvg_g_dst7_32_t[0][0]; } else if (ver == DCT8) { - ver_coeff = uvg_g_dct8_32; + ver_coeff = &uvg_g_dct8_32[0][0]; } __m256i v_ver_pass_out[32]; @@ -6108,8 +6108,8 @@ static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, in // Prepare coeffs // TODO: either rename these old coeff tables to be consistent with other new avx2 functions // or construct them here in place. 
Should be ease to accomplish with set1_epi32, just use a int32_t combined from two int16_t - const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[0]); - const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[16]); + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]); // Got data for 4 vectors, 32 lines with 2 samples each __m256i v_result_e[4]; @@ -6147,7 +6147,7 @@ static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, in // Got data for 8 vectors, 32 lines with 4 samples each // Prepare coeffs - const int16_t* coeff = uvg_g_dct_4; + const int16_t* coeff = &uvg_g_dct_4[0][0]; const int a = coeff[0]; const int b = coeff[1 * 4 + 0]; const int c = coeff[1 * 4 + 1]; @@ -6891,11 +6891,11 @@ void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_2nd = INVERSE_SHIFT_2ND; const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename - const int16_t* hor_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = &uvg_g_dct_32_t[0][0]; if (hor == DST7) { - hor_coeff = uvg_g_dst7_32_t; + hor_coeff = &uvg_g_dst7_32_t[0][0]; } else if (hor == DCT8) { - hor_coeff = uvg_g_dct8_32; + hor_coeff = &uvg_g_dct8_32[0][0]; } if (ver == DST7) { ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename @@ -8023,7 +8023,7 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_1st = INVERSE_SHIFT_1ST; const int32_t shift_2nd = INVERSE_SHIFT_2ND; - const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; if (hor == DST7) { hor_coeff = fi_dst7_32xN_coeff_hor; @@ -8031,9 +8031,9 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, hor_coeff = fi_dct8_32xN_coeff_hor; } 
if (ver == DST7) { - ver_coeff = uvg_g_dst7_32_t; + ver_coeff = &uvg_g_dst7_32_t[0][0]; } else if (ver == DCT8) { - ver_coeff = uvg_g_dct8_32; + ver_coeff = &uvg_g_dct8_32[0][0]; } __m256i v_ver_pass_out[64]; diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h index 946ab6b8..47900966 100644 --- a/src/strategies/avx2/dct_avx2_tables.h +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -4830,101 +4830,5 @@ typedef int16_t TMatrixCoeff; { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \ } -#define TRANSFORM_NUMBER_OF_DIRECTIONS 1 -#define ALIGN_DATA(nBytes,v) __declspec(align(nBytes)) v -#define MEMORY_ALIGN_DEF_SIZE 32 // for use with avx2 (256 bit) -//-------------------------------------------------------------------------------------------------- -// DCT-2 -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2]) = -{ - DEFINE_DCT2_P2_MATRIX(64), - //DEFINE_DCT2_P2_MATRIX(64) -}; - -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = -{ - DEFINE_DCT2_P4_MATRIX(64, 83, 36), - //DEFINE_DCT2_P4_MATRIX(64, 83, 36) -}; - -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = -{ - DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18), - //DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18) -}; - -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = -{ - DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9), - //DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9) -}; - -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = -{ - DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 
54, 46, 38, 31, 22, 13, 4), - //DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) -}; - -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64]) = -{ - DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2), - //DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2) -}; - -// DCT-8 -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = -{ - DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29), - //DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = -{ - DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17), - //DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = -{ - DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8), - //DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = -{ - DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4), - //DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 
21, 17, 13, 9, 4) -}; - -// DST-7 -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) = -{ - DEFINE_DST7_P4_MATRIX(29, 55, 74, 84), - //DEFINE_DST7_P4_MATRIX(29, 55, 74, 84) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) = -{ - DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86), - //DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) = -{ - DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88), - //DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88) -}; -ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) = -{ - DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90), - //DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90) -}; - -//-------------------------------------------------------------------------------------------------- - -static const int16_t* vvenc_matrix_coeffs[3][6] = { - {g_trCoreDCT2P2[0][0], g_trCoreDCT2P4[0][0], g_trCoreDCT2P8[0][0], g_trCoreDCT2P16[0][0], g_trCoreDCT2P32[0][0], g_trCoreDCT2P64[0][0]}, - {NULL, g_trCoreDCT8P4[0][0], g_trCoreDCT8P8[0][0], g_trCoreDCT8P16[0][0], g_trCoreDCT8P32[0][0], NULL}, - {NULL, g_trCoreDST7P4[0][0], g_trCoreDST7P8[0][0], g_trCoreDST7P16[0][0], g_trCoreDST7P32[0][0], NULL}, -}; - -//! 
\} - - #endif DCT_AVX2_TABLES_H From d62a3f888e8ae89a6c7f7a3f2865294692baf136 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Jul 2023 15:24:31 +0300 Subject: [PATCH 246/254] [avx2] static all transform tables --- src/strategies/avx2/dct-avx2.c | 96 +++--- src/strategies/avx2/dct_avx2_tables.h | 419 +++++++++++++------------- 2 files changed, 254 insertions(+), 261 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 566efba3..0610162e 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -82,7 +82,7 @@ static INLINE __m256i truncate_avx2(__m256i v, __m256i debias, int32_t shift) // TODO: find avx2 solution for transpose // TODO: attempt to make a generic transpose for avx2. Needs some extra logic for different widths and heights. // TODO: make a few solutions for exact sizes and see if some pattern emerges... -void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) { +static void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) { const int sample_num = width * height; const int vectors = sample_num / 16; @@ -150,7 +150,7 @@ void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const i } } -void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height) +static void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { @@ -644,7 +644,7 @@ static transpose_func* transpose_func_table[6][6] = { // Dispatcher function for avx2 transposes. 
This calls the proper subfunction -void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height) +static void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height) { // No need to transpose something of width or height 1 const int w_log2_minus1 = uvg_g_convert_to_log2[width] - 1; @@ -2043,7 +2043,7 @@ static void fast_forward_tr_2xN_avx2_hor(const int16_t* src, __m256i* dst, const } } -void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 8; @@ -2162,7 +2162,7 @@ static void fast_inverse_tr_2x8_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 8; @@ -2187,7 +2187,7 @@ void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } -void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 16; @@ -2350,7 +2350,7 @@ static void fast_inverse_tr_2x16_avx2_hor(const __m256i* src, int16_t* dst, cons _mm256_store_si256((__m256i*) & dst[16], v_result_1); } -void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 16; @@ -2375,7 +2375,7 @@ void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* 
dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 32; @@ -2551,7 +2551,7 @@ static void fast_inverse_tr_2x32_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 2; const int height = 32; @@ -2574,7 +2574,7 @@ void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +static void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 const __m256i debias = _mm256_set1_epi32(add); @@ -2620,7 +2620,7 @@ void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_ } } -void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 4; @@ -2733,7 +2733,7 @@ static void fast_inverse_tr_4x4_avx2_ver(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 4; @@ -2764,7 +2764,7 @@ void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } -void fast_forward_tr_4x8_avx2(const 
int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 8; @@ -2941,7 +2941,7 @@ static void fast_inverse_tr_4x8_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*) & dst[16], v_result_1); } -void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 8; @@ -2972,7 +2972,7 @@ void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } -void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 16; @@ -3169,7 +3169,7 @@ static void fast_inverse_tr_4x16_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 16; @@ -3200,7 +3200,7 @@ void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int height = 32; @@ -3404,7 +3404,7 @@ static void fast_inverse_tr_4x32_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 4; const int 
height = 32; @@ -3495,7 +3495,7 @@ static void fast_forward_tr_8xN_avx2_hor(const int16_t* src, __m256i* dst, const } } -void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 2; @@ -3613,7 +3613,7 @@ static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 2; @@ -3637,7 +3637,7 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); } -void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 4; @@ -3804,7 +3804,7 @@ static void fast_inverse_tr_8x4_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*) & dst[16], v_result[1]); } -void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 4; @@ -3835,7 +3835,7 @@ void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } -void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 8; @@ -4052,7 +4052,7 @@ static void 
fast_inverse_tr_8x8_avx2_ver(const __m256i* src, int16_t* dst, const } } -void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 8; @@ -4083,7 +4083,7 @@ void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t } -void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 16; @@ -4331,7 +4331,7 @@ static void fast_inverse_tr_8x16_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 16; @@ -4362,7 +4362,7 @@ void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 32; @@ -4576,7 +4576,7 @@ static void fast_inverse_tr_8x32_avx2_hor(const __m256i* src, int16_t* dst, cons // TODO: mts cutoff } -void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 32; @@ -4689,7 +4689,7 @@ static void fast_forward_DCT2_B16_avx2_hor(const int16_t* src, __m256i* dst, con } } -void fast_forward_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_16x2_avx2(const int16_t* src, 
int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 2; @@ -4806,7 +4806,7 @@ static void fast_inverse_tr_16x2_avx2_hor(const __m256i* src, int16_t* dst, cons _mm256_store_si256((__m256i*) & dst[16], v_result_1); } -void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 2; @@ -4831,7 +4831,7 @@ void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 4; @@ -5039,7 +5039,7 @@ static void fast_inverse_tr_16x4_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 4; @@ -5070,7 +5070,7 @@ void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 8; @@ -5293,7 +5293,7 @@ static void fast_inverse_tr_16x8_avx2_hor(const __m256i* src, int16_t* dst, cons } } -void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 8; @@ -5324,7 +5324,7 @@ void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } 
-void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 16; @@ -5629,7 +5629,7 @@ static void fast_inverse_tr_16x16_avx2_ver(const __m256i* src, int16_t* dst, con //transpose_avx2(v_result, (__m256i*)dst, 16, 16); } -void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 16; @@ -5660,7 +5660,7 @@ void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 32; @@ -5938,7 +5938,7 @@ static void fast_inverse_tr_16x32_avx2_hor(const __m256i* src, int16_t* dst, con } } -void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 16; const int height = 32; @@ -6285,7 +6285,7 @@ static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, in } -void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 2; @@ -6450,7 +6450,7 @@ static void fast_inverse_tr_32x2_avx2_hor(const __m256i* src, int16_t* dst, cons // TODO: cutoff for DCT8 and DST7 } -void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_32x2_avx2(const int16_t* 
src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 2; @@ -6472,7 +6472,7 @@ void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 4; @@ -6879,7 +6879,7 @@ static void fast_inverse_tr_32x4_avx2_mts_hor(const __m256i* src, int16_t* dst, // TODO: cutoff for dct8 and dst7 } -void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 4; @@ -6920,7 +6920,7 @@ void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 8; @@ -7206,7 +7206,7 @@ static void fast_inverse_tr_32x8_avx2_hor(const __m256i* src, int16_t* dst, cons // TODO: cutoff for dct8 and dst7 } -void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 8; @@ -7242,7 +7242,7 @@ void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 16; @@ -7615,7 +7615,7 @@ static void fast_inverse_tr_32x16_avx2_hor(const __m256i* src, 
int16_t* dst, con // TODO: MTS cutoff } -void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 16; @@ -7646,7 +7646,7 @@ void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } -void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 32; @@ -8012,7 +8012,7 @@ static void fast_inverse_tr_32x32_avx2_hor(const __m256i* src, int16_t* dst, con } } -void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +static void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 32; const int height = 32; diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h index 47900966..5d02b617 100644 --- a/src/strategies/avx2/dct_avx2_tables.h +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -5,15 +5,15 @@ // Shuffle tables for simple avx2 functions -ALIGNED(32) const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; -ALIGNED(32) const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; +ALIGNED(32) static const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; +ALIGNED(32) static const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; -ALIGNED(32) const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 0, 0, 2, 2, 2, 2 }; -ALIGNED(32) const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 6, 6, 6 }; -ALIGNED(32) const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; -ALIGNED(32) const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 
0, 0, 2, 2, 2, 2 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 6, 6, 6 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; -ALIGNED(32) const int32_t ff_dct2_b32_permute[8][8] = { +ALIGNED(32) static const int32_t ff_dct2_b32_permute[8][8] = { {0, 0, 0, 0, 0, 0, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 1}, {2, 2, 2, 2, 2, 2, 2, 2}, @@ -27,29 +27,29 @@ ALIGNED(32) const int32_t ff_dct2_b32_permute[8][8] = { // Coeff tables for simple avx2 functions -ALIGNED(32) const int16_t fast_forward_dct2_b2_coeff[32] = { +ALIGNED(32) static const int16_t fast_forward_dct2_b2_coeff[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, }; - const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward +static const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward // Coeff arrays for B4 -ALIGNED(32) const int16_t fast_forward_dct2_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_forward_dct2_b4_coeff[64] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83, -36, -83, -36, -83, -36, -83, -36, -83, 83, -36, 83, -36, 83, -36, 83, -36, }; -ALIGNED(32) const int16_t fast_forward_dst7_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_forward_dst7_b4_coeff[64] = { 29, 55, 29, 55, 29, 55, 29, 55, 84, -29, 84, -29, 84, -29, 84, -29, 74, 84, 74, 84, 74, 84, 74, 84, -74, 55, -74, 55, -74, 55, -74, 55, 74, 74, 74, 74, 74, 74, 74, 74, 55, -84, 55, -84, 55, -84, 55, -84, 0, -74, 0, -74, 0, -74, 0, -74, 74, -29, 74, -29, 74, -29, 74, -29, }; -ALIGNED(32) 
const int16_t fast_forward_dct8_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_forward_dct8_b4_coeff[64] = { 84, 74, 84, 74, 84, 74, 84, 74, 55, -74, 55, -74, 55, -74, 55, -74, 55, 29, 55, 29, 55, 29, 55, 29, -29, 84, -29, 84, -29, 84, -29, 84, 74, 0, 74, 0, 74, 0, 74, 0, 29, -74, 29, -74, 29, -74, 29, -74, @@ -57,21 +57,21 @@ ALIGNED(32) const int16_t fast_forward_dct8_b4_coeff[64] = { }; // Coeff arrays for inverse B4 -ALIGNED(32) const int16_t fast_inverse_dct2_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_inverse_dct2_b4_coeff[64] = { 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, }; -ALIGNED(32) const int16_t fast_inverse_dst7_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_inverse_dst7_b4_coeff[64] = { 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, }; -ALIGNED(32) const int16_t fast_inverse_dct8_b4_coeff[64] = { +ALIGNED(32) static const int16_t fast_inverse_dct8_b4_coeff[64] = { 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, @@ -79,7 +79,7 @@ ALIGNED(32) const int16_t fast_inverse_dct8_b4_coeff[64] = { }; // Coeff arrays for forward B8 -ALIGNED(32) const int16_t fast_forward_dct2_b8_coeff[128] = { +ALIGNED(32) static const int16_t fast_forward_dct2_b8_coeff[128] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, -18, -50, -83, -36, 
50, 89, 64, 64, -18, -50, -83, -36, 50, 89, @@ -90,7 +90,7 @@ ALIGNED(32) const int16_t fast_forward_dct2_b8_coeff[128] = { -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, }; -ALIGNED(32) const int16_t fast_forward_dst7_b8_coeff[128] = { +ALIGNED(32) static const int16_t fast_forward_dst7_b8_coeff[128] = { 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, @@ -101,7 +101,7 @@ ALIGNED(32) const int16_t fast_forward_dst7_b8_coeff[128] = { -71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, }; -ALIGNED(32) const int16_t fast_forward_dct8_b8_coeff[128] = { +ALIGNED(32) static const int16_t fast_forward_dct8_b8_coeff[128] = { 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, @@ -113,7 +113,7 @@ ALIGNED(32) const int16_t fast_forward_dct8_b8_coeff[128] = { }; // Coeff arrays for inverse B8 -ALIGNED(32) const int16_t fast_inverse_dct2_b8_coeff[128] = { +ALIGNED(32) static const int16_t fast_inverse_dct2_b8_coeff[128] = { 64, 89, 64, 75, 64, 50, 64, 18, 64, 89, 64, 75, 64, 50, 64, 18, 83, 75, 36, -18, -36, -89, -83, -50, 83, 75, 36, -18, -36, -89, -83, -50, 64, 50, -64, -89, -64, 18, 64, 75, 64, 50, -64, -89, -64, 18, 64, 75, @@ -124,7 +124,7 @@ ALIGNED(32) const int16_t fast_inverse_dct2_b8_coeff[128] = { -36, 89, 83, -75, -83, 50, 36, -18, -36, 89, 83, -75, -83, 50, 36, -18, }; -ALIGNED(32) const int16_t fast_inverse_dst7_b8_coeff[128] = { +ALIGNED(32) static const int16_t fast_inverse_dst7_b8_coeff[128] = { 17, 46, 32, 78, 46, 86, 60, 71, 17, 46, 32, 78, 46, 86, 60, 71, 71, 85, 85, 46, 32, -60, -46, -78, 71, 85, 85, 46, 32, -60, -46, -78, 86, 78, -17, -71, -85, -17, 32, 85, 86, 78, -17, -71, -85, -17, 32, 85, @@ -135,10 
+135,10 @@ ALIGNED(32) const int16_t fast_inverse_dst7_b8_coeff[128] = { -46, 85, 85, -71, -78, 46, 32, -17, -46, 85, 85, -71, -78, 46, 32, -17, }; - const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. +static const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. // Coeff arrays for forward B16 -ALIGNED(32) const int16_t fast_forward_dct2_b16_coeff[256] = { +ALIGNED(32) static const int16_t fast_forward_dct2_b16_coeff[256] = { 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, @@ -157,7 +157,7 @@ ALIGNED(32) const int16_t fast_forward_dct2_b16_coeff[256] = { 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, }; -ALIGNED(32) const int16_t fast_forward_dst7_b16_coeff[256] = { +ALIGNED(32) static const int16_t fast_forward_dst7_b16_coeff[256] = { 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, @@ -176,7 +176,7 @@ ALIGNED(32) const int16_t fast_forward_dst7_b16_coeff[256] = { 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, }; -ALIGNED(32) const int16_t fast_forward_dct8_b16_coeff[256] = { +ALIGNED(32) static const int16_t fast_forward_dct8_b16_coeff[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, @@ -196,7 +196,7 @@ ALIGNED(32) const int16_t fast_forward_dct8_b16_coeff[256] = { }; // Coeff arrays for inverse B16 -ALIGNED(32) const int16_t fast_inverse_dct2_b16_coeff[256] = { +ALIGNED(32) 
static const int16_t fast_inverse_dct2_b16_coeff[256] = { 64, 90, 64, 87, 64, 80, 64, 70, 64, -9, 64, -25, 64, -43, 64, -57, 89, 87, 75, 57, 50, 9, 18, -43, -89, 25, -75, 70, -50, 90, -18, 80, 83, 80, 36, 9, -36, -70, -83, -87, 83, -43, 36, -90, -36, -57, -83, 25, @@ -215,7 +215,7 @@ ALIGNED(32) const int16_t fast_inverse_dct2_b16_coeff[256] = { 89, 70, -75, -80, 50, 87, -18, -90, -89, 57, 75, -43, -50, 25, 18, -9, }; -ALIGNED(32) const int16_t fast_inverse_dst7_b16_coeff[256] = { +ALIGNED(32) static const int16_t fast_inverse_dst7_b16_coeff[256] = { 8, 25, 17, 48, 25, 68, 33, 81, 68, 48, 73, 25, 77, 0, 81, -25, // 0 40, 55, 73, 87, 88, 81, 85, 40, -81, -25, -88, 33, -77, 77, -48, 88, 68, 77, 88, 77, 48, 0, -25, -77, 88, 0, 68, -77, 0, -77, -68, 0, @@ -234,10 +234,10 @@ ALIGNED(32) const int16_t fast_inverse_dst7_b16_coeff[256] = { 85, 73, -68, -81, 40, 87, -8, -88, -87, 55, 73, -40, -48, 25, 17, -8, }; - const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; +static const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; // Coeff arrays for forward B32 -ALIGNED(32) const int16_t fast_forward_dct2_b32_coeff[1024] = { +ALIGNED(32) static const int16_t fast_forward_dct2_b32_coeff[1024] = { 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, @@ -304,7 +304,7 @@ ALIGNED(32) const int16_t fast_forward_dct2_b32_coeff[1024] = { -83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, }; -ALIGNED(32) const int16_t fast_forward_dst7_b32_coeff[1024] = { +ALIGNED(32) static const int16_t fast_forward_dst7_b32_coeff[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, @@ -371,7 +371,7 @@ ALIGNED(32) const int16_t 
fast_forward_dst7_b32_coeff[1024] = { -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; -ALIGNED(32) const int16_t fast_forward_dct8_b32_coeff[1024] = { +ALIGNED(32) static const int16_t fast_forward_dct8_b32_coeff[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, @@ -439,7 +439,7 @@ ALIGNED(32) const int16_t fast_forward_dct8_b32_coeff[1024] = { }; // Coeff arrays for inverse B32 -ALIGNED(32) const int16_t fast_inverse_dct2_b32_coeff[1024] = { +ALIGNED(32) static const int16_t fast_inverse_dct2_b32_coeff[1024] = { 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, // 0 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, @@ -506,7 +506,7 @@ ALIGNED(32) const int16_t fast_inverse_dct2_b32_coeff[1024] = { -90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, }; -ALIGNED(32) const int16_t fast_inverse_dst7_b32_coeff[1024] = { +ALIGNED(32) static const int16_t fast_inverse_dst7_b32_coeff[1024] = { 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, // 0 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, @@ -573,7 +573,7 @@ ALIGNED(32) const int16_t fast_inverse_dst7_b32_coeff[1024] = { -89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, }; - const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; +static const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; // Shuffle tables for advanced and optimized avx2 functions @@ -582,7 +582,7 @@ ALIGNED(32) const int16_t fast_inverse_dst7_b32_coeff[1024] = { // _mm256_shuffle_epi8 // Input [0 1 2 3 4 5 6 7 | XX // Output [0 4 1 5 2 6 3 7 | XX -ALIGNED(32) const int8_t shuffle_16b_0415[32] = 
{ +ALIGNED(32) static const int8_t shuffle_16b_0415[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, }; @@ -591,7 +591,7 @@ ALIGNED(32) const int8_t shuffle_16b_0415[32] = { // _mm256_shuffle_epi8 // Input [0 1 2 3 4 5 6 7 | // Output [0 2 4 6 1 3 5 7 | -ALIGNED(32) const int8_t shuffle_16b_0246[32] = { +ALIGNED(32) static const int8_t shuffle_16b_0246[32] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, }; @@ -600,117 +600,117 @@ ALIGNED(32) const int8_t shuffle_16b_0246[32] = { // _mm256_permutevar8x32_epi32 // Input [0 1 2 3 | 4 5 6 7] // Output [0 1 4 5 | 2 6 3 7] -ALIGNED(32) const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; +ALIGNED(32) static const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; - const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415; + static const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415; -ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle1_ver[32] = { +ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle1_ver[32] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, }; -ALIGNED(32) const int8_t ff_dct2_2x8_shuffle_ver[32] = { +ALIGNED(32) static const int8_t ff_dct2_2x8_shuffle_ver[32] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 }; -ALIGNED(32) const int8_t ff_dct2_2x8_result_shuffle_ver[32] = { +ALIGNED(32) static const int8_t ff_dct2_2x8_result_shuffle_ver[32] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 }; -ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle2_ver[32] = { +ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle2_ver[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, }; -ALIGNED(32) const 
int8_t ff_dct2_2x16_ver_result_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_2x16_ver_result_shuffle[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, }; -ALIGNED(32) const int8_t fi_tr_4x4_shuffle_hor[32] = { +ALIGNED(32) static const int8_t fi_tr_4x4_shuffle_hor[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, }; -ALIGNED(32) const int8_t fi_tr_4x4_result_shuffle_ver[32] = { +ALIGNED(32) static const int8_t fi_tr_4x4_result_shuffle_ver[32] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, }; -ALIGNED(32) const int8_t fi_tr_4x8_result_shuffle_ver[32] = { +ALIGNED(32) static const int8_t fi_tr_4x8_result_shuffle_ver[32] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, }; -ALIGNED(32) const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }; -ALIGNED(32) const int8_t fi_tr_8x2_shuffle_hor[32] = { +ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_hor[32] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, }; -ALIGNED(32) const int8_t fi_tr_8x2_shuffle_ver[32] = { +ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_ver[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, }; - const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415; + static const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415; -ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = { 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15, 0, 1, 8, 9, 4, 5, 12, 13, 2, 
3, 10, 11, 6, 7, 14, 15, }; // TODO: remove duplicate tables. Rename with a more descriptive name. -ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, }; -ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 }; -ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 }; // Arrange samples into butterfly formation -ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 }; // Swap two middle 16-bit values in each 64-bit chunk -ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 }; -ALIGNED(32) const int8_t ff_dct2_16x32_reverse_64b_order[32] = { +ALIGNED(32) static const int8_t ff_dct2_16x32_reverse_64b_order[32] = { 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 22, 23, 20, 21, 18, 19, 16, 17, 30, 31, 28, 29, 26, 27, 24, 25, }; -ALIGNED(32) const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { +ALIGNED(32) static const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 
24, 25, 22, 23, 20, 21, 18, 19, 16, 17 }; -ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_order[32] = { +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_order[32] = { 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 }; -ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_result[32] = { +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_result[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 }; @@ -719,12 +719,12 @@ ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_result[32] = { // Coeff tables for advanced and optimized avx2 functions // 2xN -ALIGNED(32) const int16_t ff_dct2_2xN_coeff_hor[32] = { +ALIGNED(32) static const int16_t ff_dct2_2xN_coeff_hor[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 }; -ALIGNED(32) const int16_t ff_dct2_2x8_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct2_2x8_coeff_ver[128] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 89, 75, 50, 18, -18, -50, -75, -89, 89, 75, 50, 18, -18, -50, -75, -89, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, @@ -735,7 +735,7 @@ ALIGNED(32) const int16_t ff_dct2_2x8_coeff_ver[128] = { 18, -50, 75, -89, 89, -75, 50, -18, 18, -50, 75, -89, 89, -75, 50, -18 }; -ALIGNED(32) +ALIGNED(32) static const int16_t ff_dst7_2x8_coeff_ver[128] = { 17, 32, 46, 60, 71, 78, 85, 86, 17, 32, 46, 60, 71, 78, 85, 86, 46, 78, 86, 71, 32, -17, -60, -85, 46, 78, 86, 71, 32, -17, -60, -85, @@ -748,7 +748,7 @@ const int16_t ff_dst7_2x8_coeff_ver[128] = { }; -ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct2_2x8_coeff_ver[128] = { 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18, 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, -36, -89, 64, 18, 
-83, -50, 64, 50, -36, -89, 64, 18, -83, -50, @@ -759,7 +759,7 @@ ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = { -64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dst7_2x8_coeff_ver[128] = { 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46, 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60, 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78, @@ -770,7 +770,7 @@ ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = { -71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17, }; -ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct8_2x8_coeff_ver[128] = { 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32, 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46, 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17, @@ -783,7 +783,7 @@ ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = { -ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { +ALIGNED(32) static const int16_t fi_dct2_2x16_coeff_ver[512] = { 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, 64, 87, 75, 57, 64, 87, 75, 57, -64, -80, -89, -90, -64, -80, -89, -90, @@ -818,7 +818,7 @@ ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = { +ALIGNED(32) static const int16_t fi_dst7_2x16_coeff_ver[512] = { 8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0 68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17, 17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85, @@ -853,7 +853,7 @@ ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = { 81, -77, 73, -68, 81, 
-77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8, }; -ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { +ALIGNED(32) static const int16_t fi_dct2_2x32_coeff_ver[2048] = { 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 64, 61, 57, 54, 50, 46, 43, 38, @@ -986,7 +986,7 @@ ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { // 4xN -ALIGNED(32) const int16_t ff_dct2_4x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct2_4x8_coeff_ver[256] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, @@ -1005,7 +1005,7 @@ ALIGNED(32) const int16_t ff_dct2_4x8_coeff_ver[256] = { 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, }; -ALIGNED(32) const int16_t ff_dst7_4x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dst7_4x8_coeff_ver[256] = { 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, // 0 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, @@ -1024,7 +1024,7 @@ ALIGNED(32) const int16_t ff_dst7_4x8_coeff_ver[256] = { 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, }; -ALIGNED(32) const int16_t ff_dct8_4x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct8_4x8_coeff_ver[256] = { 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, @@ -1044,21 +1044,21 @@ ALIGNED(32) const int16_t ff_dct8_4x8_coeff_ver[256] = { }; -ALIGNED(32) const int16_t fi_dct2_4xN_coeff_hor[64] = { +ALIGNED(32) static const int16_t fi_dct2_4xN_coeff_hor[64] = { 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 
36, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_4xN_coeff_hor[64] = { +ALIGNED(32) static const int16_t fi_dst7_4xN_coeff_hor[64] = { 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, }; -ALIGNED(32) const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary +ALIGNED(32) static const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, @@ -1066,7 +1066,7 @@ ALIGNED(32) const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probab }; -ALIGNED(32) const int16_t fi_dct2_4x8_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_hor[128] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, @@ -1077,7 +1077,7 @@ ALIGNED(32) const int16_t fi_dct2_4x8_coeff_hor[128] = { -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, }; -ALIGNED(32) const int16_t fi_dst7_4x8_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_hor[128] = { 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 74, -74, 74, -74, 74, -74, 74, -74, 
74, -74, 74, -74, 74, -74, 74, -74, @@ -1088,7 +1088,7 @@ ALIGNED(32) const int16_t fi_dst7_4x8_coeff_hor[128] = { -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, }; -ALIGNED(32) const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, @@ -1100,7 +1100,7 @@ ALIGNED(32) const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is proba }; -ALIGNED(32) const int16_t fi_dct2_4x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_ver[256] = { 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, @@ -1119,7 +1119,7 @@ ALIGNED(32) const int16_t fi_dct2_4x8_coeff_ver[256] = { 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_4x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_ver[256] = { 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, @@ -1138,7 +1138,7 @@ ALIGNED(32) const int16_t fi_dst7_4x8_coeff_ver[256] = { 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, }; -ALIGNED(32) const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward table, remove this if unnecessary +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward 
table, remove this if unnecessary 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, @@ -1158,7 +1158,7 @@ ALIGNED(32) const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is proba }; -ALIGNED(32) const int16_t fi_dct2_4x16_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_hor[128] = { 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, @@ -1169,7 +1169,7 @@ ALIGNED(32) const int16_t fi_dct2_4x16_coeff_hor[128] = { 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_4x16_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_hor[128] = { 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, @@ -1180,7 +1180,7 @@ ALIGNED(32) const int16_t fi_dst7_4x16_coeff_hor[128] = { 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, }; -ALIGNED(32) const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, @@ -1192,7 +1192,7 @@ ALIGNED(32) const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is prob }; -ALIGNED(32) const int16_t fi_dct2_4x16_coeff_ver[512] = { +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_ver[512] = { 64, 90, 89, 
87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, @@ -1227,7 +1227,7 @@ ALIGNED(32) const int16_t fi_dct2_4x16_coeff_ver[512] = { 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_4x16_coeff_ver[512] = { +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_ver[512] = { 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, @@ -1262,7 +1262,7 @@ ALIGNED(32) const int16_t fi_dst7_4x16_coeff_ver[512] = { 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, }; -ALIGNED(32) const int16_t fi_dct8_4x16_coeff_ver[512] = { +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_ver[512] = { 88, 88, 87, 85, 81, 77, 73, 68, 88, 88, 87, 85, 81, 77, 73, 68, // 0 62, 55, 48, 40, 33, 25, 17, 8, 62, 55, 48, 40, 33, 25, 17, 8, 88, 81, 68, 48, 25, 0, -25, -48, 88, 81, 68, 48, 25, 0, -25, -48, @@ -1298,7 +1298,7 @@ ALIGNED(32) const int16_t fi_dct8_4x16_coeff_ver[512] = { }; -ALIGNED(32) const int16_t fi_dct2_4x32_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dct2_4x32_coeff_hor[128] = { 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, @@ -1309,7 +1309,7 @@ ALIGNED(32) const int16_t fi_dct2_4x32_coeff_hor[128] = { 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_4x32_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dst7_4x32_coeff_hor[128] = { 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 
74, 0, @@ -1320,7 +1320,7 @@ ALIGNED(32) const int16_t fi_dst7_4x32_coeff_hor[128] = { 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, }; -ALIGNED(32) const int16_t fi_dct8_4x32_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dct8_4x32_coeff_hor[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, @@ -1333,7 +1333,7 @@ ALIGNED(32) const int16_t fi_dct8_4x32_coeff_hor[128] = { // 8xN -ALIGNED(32) const int16_t ff_dct2_8xN_coeff_hor[128] = { +ALIGNED(32) static const int16_t ff_dct2_8xN_coeff_hor[128] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, @@ -1344,7 +1344,7 @@ ALIGNED(32) const int16_t ff_dct2_8xN_coeff_hor[128] = { -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 }; -ALIGNED(32) const int16_t ff_dst7_8xN_coeff_hor[128] = { +ALIGNED(32) static const int16_t ff_dst7_8xN_coeff_hor[128] = { 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, @@ -1355,7 +1355,7 @@ ALIGNED(32) const int16_t ff_dst7_8xN_coeff_hor[128] = { -71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, }; -ALIGNED(32) const int16_t ff_dct8_8xN_coeff_hor[128] = { +ALIGNED(32) static const int16_t ff_dct8_8xN_coeff_hor[128] = { 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, @@ -1367,10 +1367,10 @@ ALIGNED(32) const int16_t ff_dct8_8xN_coeff_hor[128] = { }; - const int16_t* ff_dct2_8x2_coeff_ver = 
ff_dct2_2xN_coeff_hor; // This is identical to existing table + static const int16_t* ff_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table -ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dct2_8x2_coeff_hor[128] = { 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, @@ -1381,7 +1381,7 @@ ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = { +ALIGNED(32) static const int16_t fi_dst7_8x2_coeff_hor[128] = { 17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32, 32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60, 46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78, @@ -1392,10 +1392,10 @@ ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = { 86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17, }; - const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + static const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table -ALIGNED(32) const int16_t ff_dct2_8x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct2_8x4_coeff_ver[128] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, @@ -1406,7 +1406,7 @@ ALIGNED(32) const int16_t ff_dct2_8x4_coeff_ver[128] = { 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, }; -ALIGNED(32) const int16_t ff_dst7_8x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dst7_8x4_coeff_ver[128] = { 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 
55, 29, 55, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, @@ -1417,7 +1417,7 @@ ALIGNED(32) const int16_t ff_dst7_8x4_coeff_ver[128] = { 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, }; -ALIGNED(32) const int16_t ff_dct8_8x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct8_8x4_coeff_ver[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, @@ -1429,7 +1429,7 @@ ALIGNED(32) const int16_t ff_dct8_8x4_coeff_ver[128] = { }; -ALIGNED(32) const int16_t fi_dct2_8x4_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_hor[256] = { 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, @@ -1448,7 +1448,7 @@ ALIGNED(32) const int16_t fi_dct2_8x4_coeff_hor[256] = { 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_8x4_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_hor[256] = { 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, @@ -1467,7 +1467,7 @@ ALIGNED(32) const int16_t fi_dst7_8x4_coeff_hor[256] = { 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, }; -ALIGNED(32) const int16_t fi_dct8_8x4_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dct8_8x4_coeff_hor[256] = { 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, @@ -1487,7 +1487,7 @@ ALIGNED(32) const 
int16_t fi_dct8_8x4_coeff_hor[256] = { }; -ALIGNED(32) const int16_t fi_dct2_8x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_ver[128] = { 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, @@ -1498,7 +1498,7 @@ ALIGNED(32) const int16_t fi_dct2_8x4_coeff_ver[128] = { 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_8x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_ver[128] = { 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, @@ -1509,24 +1509,24 @@ ALIGNED(32) const int16_t fi_dst7_8x4_coeff_ver[128] = { 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, }; - const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table + static const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table -ALIGNED(32) const int16_t ff_dct2_8x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dct2_8x8_coeff_ver[64] = { 64, 64, 64, 64, 64, 64, 64, 64, 89, 50, 75, 18, -18, -75, -50, -89, 83, -36, 36, -83, -83, 36, -36, 83, 75, -89, -18, -50, 50, 18, 89, -75, 64, -64, -64, 64, 64, -64, -64, 64, 50, 18, -89, 75, -75, 89, -18, -50, 36, 83, -83, -36, -36, -83, 83, 36, 18, 75, -50, -89, 89, 50, -75, -18, }; -ALIGNED(32) const int16_t ff_dst7_8x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dst7_8x8_coeff_ver[64] = { 17, 46, 32, 60, 71, 85, 78, 86, 46, 86, 78, 71, 32, -60, -17, -85, 71, 32, 85, -46, -86, 17, -60, 78, 85, -60, 46, -78, 17, 32, 86, -71, 86, -85, -17, 32, 78, -71, -46, 60, 78, -17, -71, 85, -60, 86, -32, -46, 60, 71, -86, -17, -46, -78, 85, 32, 32, 78, -60, -86, 85, 46, -71, -17, }; -ALIGNED(32) const int16_t 
ff_dct8_8x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dct8_8x8_coeff_ver[64] = { 86, 78, 85, 71, 60, 32, 46, 17, 85, 17, 60, -32, -71, -78, -86, -46, 78, -60, 17, -86, -46, 85, 32, 71, 71, -86, -32, -17, 78, -46, 60, -85, 60, -46, -71, 78, 32, -17, -85, 86, 46, 32, -86, 60, -85, 71, 17, -78, @@ -1534,7 +1534,7 @@ ALIGNED(32) const int16_t ff_dct8_8x8_coeff_ver[64] = { }; -ALIGNED(32) const int16_t fi_dct2_8x8_coeff_hor[512] = { +ALIGNED(32) static const int16_t fi_dct2_8x8_coeff_hor[512] = { 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, // 0 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, @@ -1569,7 +1569,7 @@ ALIGNED(32) const int16_t fi_dct2_8x8_coeff_hor[512] = { 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_8x8_coeff_hor[512] = { +ALIGNED(32) static const int16_t fi_dst7_8x8_coeff_hor[512] = { 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, // 0 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, @@ -1604,7 +1604,7 @@ ALIGNED(32) const int16_t fi_dst7_8x8_coeff_hor[512] = { 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, }; -ALIGNED(32) const int16_t fi_dct8_8x8_coeff_hor[512] = { +ALIGNED(32) static const int16_t fi_dct8_8x8_coeff_hor[512] = { 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, @@ -1640,7 +1640,7 @@ ALIGNED(32) const int16_t fi_dct8_8x8_coeff_hor[512] = { }; -ALIGNED(32) const int16_t ff_dct2_8x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct2_8x16_coeff_ver[256] = { 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, 
-25, 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, @@ -1659,7 +1659,7 @@ ALIGNED(32) const int16_t ff_dct2_8x16_coeff_ver[256] = { -64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, }; -ALIGNED(32) const int16_t ff_dst7_8x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dst7_8x16_coeff_ver[256] = { 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, @@ -1678,7 +1678,7 @@ ALIGNED(32) const int16_t ff_dst7_8x16_coeff_ver[256] = { -68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, }; -ALIGNED(32) const int16_t ff_dct8_8x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct8_8x16_coeff_ver[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, @@ -1697,7 +1697,7 @@ ALIGNED(32) const int16_t ff_dct8_8x16_coeff_ver[256] = { -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, }; -ALIGNED(32) const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { +ALIGNED(32) static const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, @@ -1717,14 +1717,14 @@ ALIGNED(32) const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { }; - const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; + static const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; - const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; + static const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; - const int16_t* 
fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; + static const int16_t* fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; -ALIGNED(32) const int16_t fi_dct2_8x16_coeff_ver[2048] = { +ALIGNED(32) static const int16_t fi_dct2_8x16_coeff_ver[2048] = { 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, // 0 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, @@ -1855,7 +1855,7 @@ ALIGNED(32) const int16_t fi_dct2_8x16_coeff_ver[2048] = { 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_8x16_coeff_ver[2048] = { +ALIGNED(32) static const int16_t fi_dst7_8x16_coeff_ver[2048] = { 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, // 0 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, @@ -1986,7 +1986,7 @@ ALIGNED(32) const int16_t fi_dst7_8x16_coeff_ver[2048] = { 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, }; -ALIGNED(32) const int16_t fi_dct8_8x16_coeff_ver[2048] = { +ALIGNED(32) static const int16_t fi_dct8_8x16_coeff_ver[2048] = { 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, // 0 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, @@ -2118,7 +2118,7 @@ ALIGNED(32) const int16_t fi_dct8_8x16_coeff_ver[2048] = { }; -ALIGNED(32) const int16_t ff_dct2_8x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct2_8x32_coeff_ver[1024] = { 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, @@ -2185,7 +2185,7 @@ ALIGNED(32) const int16_t ff_dct2_8x32_coeff_ver[1024] = { -83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, }; -ALIGNED(32) const int16_t 
ff_dst7_8x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dst7_8x32_coeff_ver[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, @@ -2252,7 +2252,7 @@ ALIGNED(32) const int16_t ff_dst7_8x32_coeff_ver[1024] = { -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dct8_8x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct8_8x32_coeff_ver[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, @@ -2320,15 +2320,15 @@ ALIGNED(32) const int16_t ff_dct8_8x32_coeff_ver[1024] = { }; - const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; + static const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; - const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; + static const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; - const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; + static const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; // 16xN -ALIGNED(32) const int16_t ff_dct2_16xN_coeff_hor[256] = { +ALIGNED(32) static const int16_t ff_dct2_16xN_coeff_hor[256] = { 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, @@ -2347,7 +2347,7 @@ ALIGNED(32) const int16_t ff_dct2_16xN_coeff_hor[256] = { 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, }; -ALIGNED(32) const int16_t ff_dst7_16xN_coeff_hor[256] = { +ALIGNED(32) static const int16_t ff_dst7_16xN_coeff_hor[256] = { 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 25, 33, 68, 81, 88, 
85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, @@ -2366,7 +2366,7 @@ ALIGNED(32) const int16_t ff_dst7_16xN_coeff_hor[256] = { 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, }; -ALIGNED(32) const int16_t ff_dct8_16xN_coeff_hor[256] = { +ALIGNED(32) static const int16_t ff_dct8_16xN_coeff_hor[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, @@ -2386,10 +2386,10 @@ ALIGNED(32) const int16_t ff_dct8_16xN_coeff_hor[256] = { }; - const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table +static const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table -ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { +ALIGNED(32) static const int16_t fi_dct2_16x2_coeff_hor[512] = { 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, @@ -2424,11 +2424,11 @@ ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, }; - const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + static const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table -ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = { +ALIGNED(32) static const int16_t fi_dst7_16x2_coeff_hor[512] = { 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, @@ -2464,7 +2464,7 @@ ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = { }; 
-ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, @@ -2532,7 +2532,7 @@ ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { }; -ALIGNED(32) const int16_t ff_dct2_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct2_16x4_coeff_ver[128] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, @@ -2543,7 +2543,7 @@ ALIGNED(32) const int16_t ff_dct2_16x4_coeff_ver[128] = { 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, }; -ALIGNED(32) const int16_t ff_dst7_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dst7_16x4_coeff_ver[128] = { 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, @@ -2554,7 +2554,7 @@ ALIGNED(32) const int16_t ff_dst7_16x4_coeff_ver[128] = { 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, }; -ALIGNED(32) const int16_t ff_dct8_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct8_16x4_coeff_ver[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, @@ -2566,7 +2566,7 @@ ALIGNED(32) const int16_t ff_dct8_16x4_coeff_ver[128] = { }; -ALIGNED(32) const int16_t fi_dct2_16x4_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_hor[1024] = { 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 
89, 87, // 0 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, @@ -2633,7 +2633,7 @@ ALIGNED(32) const int16_t fi_dct2_16x4_coeff_hor[1024] = { 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_16x4_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_hor[1024] = { 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, // 0 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, @@ -2700,7 +2700,7 @@ ALIGNED(32) const int16_t fi_dst7_16x4_coeff_hor[1024] = { 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, }; -ALIGNED(32) const int16_t fi_dct8_16x4_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_hor[1024] = { 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, // 0 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, @@ -2768,7 +2768,7 @@ ALIGNED(32) const int16_t fi_dct8_16x4_coeff_hor[1024] = { }; -ALIGNED(32) const int16_t fi_dct2_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_ver[128] = { 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, @@ -2779,7 +2779,7 @@ ALIGNED(32) const int16_t fi_dct2_16x4_coeff_ver[128] = { 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_ver[128] = { 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, @@ -2790,7 +2790,7 @@ ALIGNED(32) 
const int16_t fi_dst7_16x4_coeff_ver[128] = { 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, }; -ALIGNED(32) const int16_t fi_dct8_16x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_ver[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, @@ -2802,28 +2802,28 @@ ALIGNED(32) const int16_t fi_dct8_16x4_coeff_ver[128] = { }; -ALIGNED(32) const int16_t ff_dct2_16x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dct2_16x8_coeff_ver[64] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, }; -ALIGNED(32) const int16_t ff_dst7_16x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dst7_16x8_coeff_ver[64] = { 17, 32, 46, 78, 71, 85, 85, 46, 86, -17, 78, -71, 60, -86, 32, -60, 46, 60, 86, 71, 32, -46, -60, -78, -85, 32, -17, 85, 71, -17, 78, -86, 71, 78, 32, -17, -86, -60, 17, 86, 78, -46, -60, -32, -46, 85, 85, -71, 85, 86, -60, -85, 17, 78, 32, -71, -71, 60, 86, -46, -78, 32, 46, -17, }; -ALIGNED(32) const int16_t ff_dct8_16x8_coeff_ver[64] = { +ALIGNED(32) static const int16_t ff_dct8_16x8_coeff_ver[64] = { 86, 85, 85, 60, 78, 17, 71, -32, 60, -71, 46, -86, 32, -78, 17, -46, 78, 71, 17, -32, -60, -86, -86, -17, -46, 78, 32, 60, 85, -46, 71, -85, 60, 46, -71, -86, -46, 32, 78, 60, 32, -85, -85, 17, -17, 71, 86, -78, 32, 17, -78, -46, 85, 71, -46, -85, -17, 86, 71, -78, -86, 60, 60, -32, }; -ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 
50, 18, -36, -83, -89, -50, 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, @@ -2834,7 +2834,7 @@ ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 }; -ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, @@ -2854,21 +2854,21 @@ ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { }; - const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. + static const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. - const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. + static const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. - const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. + static const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. 
- const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table + static const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table - const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table + static const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table - const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table + static const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table -ALIGNED(32) const int16_t ff_dct2_16x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct2_16x16_coeff_ver[256] = { 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, @@ -2887,7 +2887,7 @@ ALIGNED(32) const int16_t ff_dct2_16x16_coeff_ver[256] = { -64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, }; -ALIGNED(32) const int16_t ff_dst7_16x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dst7_16x16_coeff_ver[256] = { 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, @@ -2906,7 +2906,7 @@ ALIGNED(32) const int16_t ff_dst7_16x16_coeff_ver[256] = { -68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, }; -ALIGNED(32) const int16_t ff_dct8_16x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct8_16x16_coeff_ver[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, @@ -2926,7 +2926,7 @@ ALIGNED(32) const int16_t ff_dct8_16x16_coeff_ver[256] = { }; -ALIGNED(32) const int16_t fi_dct2_16x16_coeff_hor[256] = { 
+ALIGNED(32) static const int16_t fi_dct2_16x16_coeff_hor[256] = { 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, @@ -2945,7 +2945,7 @@ ALIGNED(32) const int16_t fi_dct2_16x16_coeff_hor[256] = { -18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dst7_16x16_coeff_hor[256] = { 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, @@ -2964,7 +2964,7 @@ ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { -25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, }; -ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dct2_16x1_coeff_hor[256] = { 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, @@ -2983,7 +2983,7 @@ ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = { -18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = { +ALIGNED(32) static const int16_t fi_dst7_16x1_coeff_hor[256] = { 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, @@ -3002,17 +3002,17 @@ ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = { -25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, }; -ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver; +ALIGNED(32) static 
const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver; - const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor; + static const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor; - const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor; + static const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor; - const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver; + static const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver; -ALIGNED(32) const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand +ALIGNED(32) static const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, @@ -3271,7 +3271,7 @@ ALIGNED(32) const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, }; -ALIGNED(32) const int16_t ff_dct2_16x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct2_16x32_coeff_ver[1024] = { 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, @@ -3338,7 +3338,7 @@ ALIGNED(32) const int16_t ff_dct2_16x32_coeff_ver[1024] = { -83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dst7_16x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dst7_16x32_coeff_ver[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 
86, 46, 88, 30, 90, 13, 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, @@ -3405,7 +3405,7 @@ ALIGNED(32) const int16_t ff_dst7_16x32_coeff_ver[1024] = { -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dct8_16x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct8_16x32_coeff_ver[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, @@ -3473,14 +3473,14 @@ ALIGNED(32) const int16_t ff_dct8_16x32_coeff_ver[1024] = { }; - const int16_t* fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor; + static const int16_t* fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor; - const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor; + static const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor; - const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver; + static const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver; // 32xN -ALIGNED(32) const int16_t ff_dct2_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t ff_dct2_32xN_coeff_hor[1024] = { 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, @@ -3547,7 +3547,7 @@ ALIGNED(32) const int16_t ff_dct2_32xN_coeff_hor[1024] = { -83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dst7_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t ff_dst7_32xN_coeff_hor[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, 
13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 @@ -3582,7 +3582,7 @@ ALIGNED(32) const int16_t ff_dst7_32xN_coeff_hor[1024] = { -66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dct8_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t ff_dct8_32xN_coeff_hor[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 @@ -3617,7 +3617,7 @@ ALIGNED(32) const int16_t ff_dct8_32xN_coeff_hor[1024] = { -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, }; -ALIGNED(32) const int16_t fi_dct2_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dct2_32xN_coeff_hor[1024] = { 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, // 0 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, // 2 @@ -3653,7 +3653,7 @@ ALIGNED(32) const int16_t fi_dct2_32xN_coeff_hor[1024] = { }; -ALIGNED(32) const int16_t fi_dst7_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dst7_32xN_coeff_hor[1024] = { 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, 38, 86, 42, 89, 46, 90, 
50, 88, 53, 85, 56, 80, 60, 74, 63, 66, // 0 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, // 2 @@ -3689,7 +3689,7 @@ ALIGNED(32) const int16_t fi_dst7_32xN_coeff_hor[1024] = { }; -ALIGNED(32) const int16_t fi_dct8_32xN_coeff_hor[1024] = { +ALIGNED(32) static const int16_t fi_dct8_32xN_coeff_hor[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 @@ -3724,7 +3724,7 @@ ALIGNED(32) const int16_t fi_dct8_32xN_coeff_hor[1024] = { -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, }; -const int16_t ff_dct8_4x32_coeff_ver[1024] = { +static const int16_t ff_dct8_4x32_coeff_ver[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 @@ -3758,7 +3758,7 @@ const int16_t ff_dct8_4x32_coeff_ver[1024] = { 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, 
-68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, }; -const int16_t ff_dst7_4x32_coeff_ver[1024] = { +static const int16_t ff_dst7_4x32_coeff_ver[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 @@ -3793,13 +3793,13 @@ const int16_t ff_dst7_4x32_coeff_ver[1024] = { -66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; - const int16_t* ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + static const int16_t* ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table - const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; + static const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; -ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = { +ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = { 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, // 0 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, @@ -3834,7 +3834,7 @@ ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = { 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, }; -ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand +ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to 32-bit 
combined coeff table at some point, these huge tables are getting out of hand 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, @@ -3966,7 +3966,7 @@ ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // T }; -ALIGNED(32) const int16_t ff_dct2_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct2_32x4_coeff_ver[128] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, @@ -3977,7 +3977,7 @@ ALIGNED(32) const int16_t ff_dct2_32x4_coeff_ver[128] = { 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, }; -ALIGNED(32) const int16_t ff_dst7_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dst7_32x4_coeff_ver[128] = { 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, @@ -3988,7 +3988,7 @@ ALIGNED(32) const int16_t ff_dst7_32x4_coeff_ver[128] = { 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, }; -ALIGNED(32) const int16_t ff_dct8_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t ff_dct8_32x4_coeff_ver[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, @@ -4000,7 +4000,7 @@ ALIGNED(32) const int16_t ff_dct8_32x4_coeff_ver[128] = { }; -ALIGNED(32) const int16_t fi_dct2_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct2_32x4_coeff_ver[128] = { 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 
36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, @@ -4011,7 +4011,7 @@ ALIGNED(32) const int16_t fi_dct2_32x4_coeff_ver[128] = { 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, }; -ALIGNED(32) const int16_t fi_dst7_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dst7_32x4_coeff_ver[128] = { 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, @@ -4022,7 +4022,7 @@ ALIGNED(32) const int16_t fi_dst7_32x4_coeff_ver[128] = { 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, }; -ALIGNED(32) const int16_t fi_dct8_32x4_coeff_ver[128] = { +ALIGNED(32) static const int16_t fi_dct8_32x4_coeff_ver[128] = { 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, @@ -4034,7 +4034,7 @@ ALIGNED(32) const int16_t fi_dct8_32x4_coeff_ver[128] = { }; -ALIGNED(32) const int16_t ff_dct2_32x8_coeff_ver[512] = { +ALIGNED(32) static const int16_t ff_dct2_32x8_coeff_ver[512] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, @@ -4069,7 +4069,7 @@ ALIGNED(32) const int16_t ff_dct2_32x8_coeff_ver[512] = { 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, }; -ALIGNED(32) const int16_t ff_dst7_32x8_coeff_ver[512] = { +ALIGNED(32) static const int16_t ff_dst7_32x8_coeff_ver[512] = { 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, // 0 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, @@ -4104,7 +4104,7 @@ ALIGNED(32) const int16_t ff_dst7_32x8_coeff_ver[512] = { 46, -17, 46, -17, 
46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, }; -ALIGNED(32) const int16_t ff_dct8_32x8_coeff_ver[512] = { +ALIGNED(32) static const int16_t ff_dct8_32x8_coeff_ver[512] = { 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, @@ -4140,7 +4140,7 @@ ALIGNED(32) const int16_t ff_dct8_32x8_coeff_ver[512] = { }; -ALIGNED(32) const int16_t fi_dct2_32x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dct2_32x8_coeff_ver[256] = { 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, @@ -4159,7 +4159,7 @@ ALIGNED(32) const int16_t fi_dct2_32x8_coeff_ver[256] = { 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, }; -ALIGNED(32) const int16_t fi_dst7_32x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dst7_32x8_coeff_ver[256] = { 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, @@ -4178,7 +4178,7 @@ ALIGNED(32) const int16_t fi_dst7_32x8_coeff_ver[256] = { 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, }; -ALIGNED(32) const int16_t fi_dct8_32x8_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dct8_32x8_coeff_ver[256] = { 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, @@ -4198,7 +4198,7 @@ ALIGNED(32) const int16_t fi_dct8_32x8_coeff_ver[256] = { }; -ALIGNED(32) const int16_t ff_dct2_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct2_32x16_coeff_ver[256] = { 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 
9, 75, -18, 70, -43, // 0 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, @@ -4217,7 +4217,7 @@ ALIGNED(32) const int16_t ff_dct2_32x16_coeff_ver[256] = { -64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, }; -ALIGNED(32) const int16_t ff_dst7_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dst7_32x16_coeff_ver[256] = { 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, @@ -4236,7 +4236,7 @@ ALIGNED(32) const int16_t ff_dst7_32x16_coeff_ver[256] = { -68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, }; -ALIGNED(32) const int16_t ff_dct8_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t ff_dct8_32x16_coeff_ver[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, @@ -4256,7 +4256,7 @@ ALIGNED(32) const int16_t ff_dct8_32x16_coeff_ver[256] = { }; -ALIGNED(32) const int16_t fi_dct2_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dct2_32x16_coeff_ver[256] = { 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, @@ -4275,7 +4275,7 @@ ALIGNED(32) const int16_t fi_dct2_32x16_coeff_ver[256] = { -18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, }; -ALIGNED(32) const int16_t fi_dst7_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dst7_32x16_coeff_ver[256] = { 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, 40, 55, 
73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, @@ -4294,7 +4294,7 @@ ALIGNED(32) const int16_t fi_dst7_32x16_coeff_ver[256] = { -25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, }; -ALIGNED(32) const int16_t fi_dct8_32x16_coeff_ver[256] = { +ALIGNED(32) static const int16_t fi_dct8_32x16_coeff_ver[256] = { 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, @@ -4314,7 +4314,7 @@ ALIGNED(32) const int16_t fi_dct8_32x16_coeff_ver[256] = { }; -ALIGNED(32) const int16_t ff_dct2_32x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct2_32x32_coeff_ver[1024] = { 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, @@ -4381,7 +4381,7 @@ ALIGNED(32) const int16_t ff_dct2_32x32_coeff_ver[1024] = { -83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dst7_32x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dst7_32x32_coeff_ver[1024] = { 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, @@ -4448,7 +4448,7 @@ ALIGNED(32) const int16_t ff_dst7_32x32_coeff_ver[1024] = { -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, }; -ALIGNED(32) const int16_t ff_dct8_32x32_coeff_ver[1024] = { +ALIGNED(32) static const int16_t ff_dct8_32x32_coeff_ver[1024] = { 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, @@ -4516,13 +4516,6 @@ ALIGNED(32) const 
int16_t ff_dct8_32x32_coeff_ver[1024] = { }; -typedef int32_t TCoeff; -typedef int16_t TMatrixCoeff; - -//! \ingroup CommonLib -//! \{ - - // DCT-2 #define DEFINE_DCT2_P2_MATRIX(a) \ { \ From 3d4e7329528004406c894b280d283a33216a2e26 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 27 Jul 2023 12:23:32 +0300 Subject: [PATCH 247/254] [avx2] Fix issue with 16x32 inverse transform --- src/strategies/avx2/dct-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 0610162e..79517457 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -5963,7 +5963,7 @@ static void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type } __m256i v_ver_pass_out[32]; - fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, 0); int16_t* ver_pass_out = (int16_t*)v_ver_pass_out; fast_inverse_tr_16x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); } From 284724398e91d8f0089874a5206c85e6732c2aff Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 22 Aug 2023 14:01:58 +0300 Subject: [PATCH 248/254] Add some comments. 
--- src/strategies/avx2/dct-avx2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 79517457..dcaf68fa 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -8095,6 +8095,7 @@ static void mts_dct_avx2( else{ const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + // Transforms with 1 lenght dimensions are handled separately since their interface differ from other full pass functions if (height == 1) { if (width == 16) { fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0); @@ -8140,7 +8141,8 @@ static void mts_idct_avx2( else { const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - if (height == 1) { + // Transforms with 1 lenght dimensions can be transformed with existing forward functions + if (height == 1) { if (width == 16) { fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? 
fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); From 64d222d17c25fded6f543abf2cd22366d8ae1e7b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 09:42:30 +0300 Subject: [PATCH 249/254] [dep_quant] Remove dead code and fix small issue --- src/dep_quant.c | 15 +-------------- src/strategies/avx2/depquant-avx2.c | 22 ++++------------------ 2 files changed, 5 insertions(+), 32 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 8513cf77..16591390 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -686,7 +686,7 @@ void uvg_dep_quant_update_state_eos( } } uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id]; - *temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]); + *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); @@ -1037,14 +1037,6 @@ int uvg_dep_quant( height, compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - - if(0){ - printf("%d\n", scanIdx); - for (int i = 0; i < 4; i++) { - printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); - } - printf("\n"); - } } //===== find best path ===== @@ -1061,11 +1053,6 @@ int uvg_dep_quant( //===== backward scanning ===== int scanIdx = 0; context_store* ctxs = &dep_quant_context; - // printf("%d\n", scanIdx); - //for (int i = 0; i < 4; i++) { - // printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); - //} - //printf("\n"); for (; prev_id >= 0; scanIdx++) { Decision temp = dep_quant_context.m_trellis[scanIdx]; int32_t blkpos = scan[scanIdx]; diff 
--git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 5ef1936e..ddca134e 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -480,10 +480,6 @@ static void xDecide( PQData pqData; preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); - //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); - //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); if (spt == SCAN_EOCSBB) { checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); @@ -594,7 +590,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); max_abs = _mm_shuffle_epi8(max_abs, control); uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4); @@ -1073,7 +1069,6 @@ static INLINE void update_states_avx2( state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; - //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? 
(unsigned)decisions->absLevel[decision_id] : 3); } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; @@ -1124,7 +1119,7 @@ static INLINE void update_states_avx2( } } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); max_abs = _mm_shuffle_epi8(max_abs, control); uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4); @@ -1230,7 +1225,7 @@ static INLINE void update_states_avx2( } __m128i sum_abs = _mm_srli_epi32(tinit, 8); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); switch (numIPos) { case 5: { @@ -1283,7 +1278,7 @@ static INLINE void update_states_avx2( __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); tinit = _mm_cvtepi16_epi32(tinit); __m128i sum_abs = _mm_srli_epi32(tinit, 8); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); switch (numIPos) { case 5: { @@ -1465,15 +1460,6 @@ void uvg_dep_quant_decide_and_update_avx2( } else if (!zeroOut) { update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); } - //for (int i = 0; i<4; i++) { - // for (int k = 0; k < 16; ++k) { - // printf( - // "%3d ", - // ctxs->m_allStates.m_absLevels[ctxs->m_curr_state_offset / 4][k * 4 + i]); - // } - // printf("\n"); - //} - //printf("\n"); if (spt == SCAN_SOCSBB) { SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); From ff77346527b47e645087391f117b620d1e31b5e0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 09:57:47 +0300 Subject: [PATCH 250/254] [dct2] Remove unnecessary memsets --- src/strategies/avx2/dct-avx2.c | 130 +--------------------------- 
src/strategies/avx2/depquant-avx2.c | 1 - 2 files changed, 3 insertions(+), 128 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index dcaf68fa..081b1b25 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -52,7 +52,9 @@ extern const int16_t uvg_g_dct_8_t[8][8]; extern const int16_t uvg_g_dct_16_t[16][16]; extern const int16_t uvg_g_dct_32_t[32][32]; -#if COMPILE_INTEL_AVX2 +#define COMPILE_INTEL_AVX2 1 + +#if COMPILE_INTEL_AVX2 #include "uvg266.h" #if UVG_BIT_DEPTH == 8 #include @@ -1739,22 +1741,7 @@ static void mts_dct_16x16_avx2(const int16_t* input, int16_t* output, tr_type_t const int skip_line = lfnst_idx ? 8 : 0; const int skip_line2 = lfnst_idx ? 8 : 0; - if (skip_line) - { - const int reduced_line = 8, cutoff = 8; - int16_t* dst2 = output + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 16; - } - } - if (skip_line2) - { - int16_t* dst2 = output + 16 * 8; - memset(dst2, 0, sizeof(int16_t) * 16 * skip_line2); - } } /**********/ @@ -1942,21 +1929,7 @@ static void mul_clip_matrix_32x32_mts_avx2(const int16_t* left, _mm256_store_si256(dst_v + dst_base + 1, h23); } - if (skip_line) - { - int16_t* dst2 = dst + reduced_line; - for (j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 32; - } - } - if (skip_line2) - { - int16_t* dst2 = dst + 32 * cutoff; - memset(dst2, 0, sizeof(int16_t) * 32 * skip_line2); - } } static void mts_dct_32x32_avx2(const int16_t* input, int16_t* output, tr_type_t type_hor, tr_type_t type_ver, uint8_t bitdepth, uint8_t lfnst_idx) @@ -3283,19 +3256,7 @@ static void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_ } transpose_avx2(temp_out, (__m256i*) dst, 32, 4); - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - if (skip_height) { - dst = 
p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -4448,19 +4409,7 @@ static void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_ transpose_avx2(temp_out, (__m256i*) dst, 32, 8); #undef NUM_PARTS #undef PART_DIMENSION - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - if (skip_height) { - dst = p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -5850,19 +5799,6 @@ static void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type #undef PART_DIMENSION #endif - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - - if (skip_height) { - dst = p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -6273,15 +6209,6 @@ static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, in dst += 16; } - if (skip_line) - { - dst = p_dst + reduced_line; - for (int j = 0; j < 8; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_line); - dst += line; - } - } } @@ -6565,19 +6492,6 @@ static void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_ dst += 32; } - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - - if (skip_height) { - dst = p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -7034,19 +6948,7 @@ static void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_ } #undef NUM_PARTS #undef PART_DIMENSION - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - if (skip_height) { - dst = p_dst + width * cutoff; - 
memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -7366,19 +7268,6 @@ static void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type } #undef NUM_PARTS #undef PART_DIMENSION - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - - if (skip_height) { - dst = p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } @@ -7838,19 +7727,6 @@ static void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type #undef PART_DIMENSION #endif - if (skip_width) { - dst = p_dst + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst, 0, sizeof(int16_t) * skip_width); - dst += width; - } - } - - if (skip_height) { - dst = p_dst + width * cutoff; - memset(dst, 0, sizeof(int16_t) * width * skip_height); - } } diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index ddca134e..b393bce6 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -37,7 +37,6 @@ #include "strategies/avx2/depquant-avx2.h" #include "strategyselector.h" -#define COMPILE_INTEL_AVX2 1 #if COMPILE_INTEL_AVX2 && defined X86_64 #include "dep_quant.h" From e32cf4fb522b4c3945a885e96bb825441f3af7df Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 10:38:29 +0300 Subject: [PATCH 251/254] [avx2] Re-enable disabled avx2 functions that do not work with non-square blocks --- src/strategies/avx2/intra-avx2.c | 8 ++++---- src/strategies/avx2/picture-avx2.c | 16 ++++++++-------- src/strategies/avx2/quant-avx2.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 30bbe7f2..838bad91 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1075,10 +1075,10 @@ int uvg_strategy_register_intra_avx2(void* opaque, uint8_t 
bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8) { - //success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); - //success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); - //success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); - //success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); + success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); + success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); + success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); + success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); } #endif //UVG_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index f8be4987..26eb535e 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1808,15 +1808,15 @@ int uvg_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_32x32_8bit_avx2); success &= uvg_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_64x64_8bit_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, 
&satd_8bit_32x32_dual_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); - //success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); + success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); + success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); + success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); + success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); + success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); + success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); + success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); - //success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); + success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); success &= uvg_strategyselector_register(opaque, "bipred_average", "avx2", 40, &bipred_average_avx2); success &= uvg_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); success &= uvg_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index bd857fa2..7729d272 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -960,7 +960,7 @@ int uvg_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth 
== 8) { - //success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); + success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); success &= uvg_strategyselector_register(opaque, "dequant", "avx2", 40, &uvg_dequant_avx2); } #endif // UVG_BIT_DEPTH == 8 From 69c1c948fad80e6452f08c3c06980be6ad85e510 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 10:41:31 +0300 Subject: [PATCH 252/254] [cfg] Specify that MTT and ISP are currently experimental --- src/cli.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cli.c b/src/cli.c index b7c56efb..6e66f77e 100644 --- a/src/cli.c +++ b/src/cli.c @@ -630,6 +630,8 @@ void print_help(void) " --mtt-depth-intra-chroma : Depth of mtt for chroma dual tree in\n" " intra slices 0..3.[0]\n" " --mtt-depth-inter : Depth of mtt for inter slices 0..3.[0]\n" + " All MTTs are currently experimental and\n" + " require disabling some avx2 optimizations.\n" " --max-bt-size : maximum size for a CU resulting from\n" " a bt split. A singular value shared for all\n" " or a list of three values for the different\n" @@ -694,6 +696,8 @@ void print_help(void) " --(no-)lfnst : Enable low frequency non-separable transform.\n" " [disabled]\n" " --(no-)isp : Enable intra sub partitions. 
[disabled]\n" + " Experimental, requires disabling some avx2\n" + " optimizations.\n" " --mts : Multiple Transform Selection [off].\n" " (Currently only implemented for intra\n" " and has effect only when rd >= 2)\n" From 079d7e9a1a2ab33c06b524b75edf7d1cadecc26c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 11:36:43 +0300 Subject: [PATCH 253/254] [tests] Fix mts_tests.c to not consider irrelevant elements --- tests/mts_tests.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/mts_tests.c b/tests/mts_tests.c index b417aa35..61f9fb2c 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -158,6 +158,7 @@ TEST dct(void) { char testname[100]; for (int blocksize = 0; blocksize < NUM_SIZES; blocksize++) { + size_t size = 1 << (LCU_MIN_LOG_W + blocksize); for (int trafo = 0; trafo < NUM_TRANSFORM; trafo++) { sprintf(testname, "Block: %d x %d, trafo: %d", 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), trafo); cu_info_t tu; @@ -172,8 +173,13 @@ TEST dct(void) test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); - for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { - ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + for (int y = 0; y < size; ++y) { + if (y>= 16) break; + for (int x = 0; x < size; ++x) { + if (x >= 16) break; + int i = y * size + x; + ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + } } //fprintf(stderr, "PASS: %s\r\n", testname); } From 4a1cd926fbe587eaea581fb9114e28fec3f62282 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 26 Sep 2023 11:47:34 +0300 Subject: [PATCH 254/254] [rdoq] Fix rdoq using uninitialized values that do not matter --- src/rdo.c | 89 +++++++++++++++++++++----- src/rdo.h | 2 +- src/strategies/avx2/quant-avx2.c | 2 +- src/strategies/generic/quant-generic.c | 4 +- src/transform.c | 4 +- 5 files changed, 80 
insertions(+), 21 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index cfc03c48..c5d1c71b 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -843,28 +843,28 @@ void uvg_rdoq_sign_hiding( } } -static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height) +static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height, uint8_t mts_index) { const coeff_t* pData = coeff + posX + posY * width; coeff_t sum = 0; if (posX < width - 1) { - sum += abs(pData[1]); + sum += mts_index && posX + 1 >= 16 ? 0 : abs(pData[1]); if (posX < width - 2) { - sum += abs(pData[2]); + sum += mts_index && posX + 2 >= 16 ? 0 : abs(pData[2]); } if (posY < height - 1) { - sum += abs(pData[width + 1]); + sum += mts_index && (posY + 1 >= 16 || posX + 1 >= 16) ? 0 : abs(pData[width + 1]); } } if (posY < height - 1) { - sum += abs(pData[width]); + sum += mts_index && posY + 1 >= 16 ? 0 : abs(pData[width]); if (posY < height - 2) { - sum += abs(pData[width << 1]); + sum += mts_index && posY + 2 >= 16 ? 0 : abs(pData[width << 1]); } } return MAX(MIN(sum - 5 * baseLevel, 31), 0); @@ -1398,6 +1398,48 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ return abs_sum; } + +static uint32_t context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, + uint32_t width, uint32_t height, int8_t color, + int32_t* temp_diag, int32_t* temp_sum, int8_t mts) +{ + const coeff_t* data = coeff + pos_x + pos_y * width; + const int diag = pos_x + pos_y; + int num_pos = 0; + int sum_abs = 0; +#define UPDATE(x) {int a=abs(x);sum_abs+=MIN(4+(a&1),a);num_pos+=(a?1:0);} + if (pos_x < width - 1) + { + UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[1]); + if (pos_x < width - 2) + { + UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[2]); + } + if (pos_y < height - 1) + { + UPDATE(mts && (pos_y + 1 >= 16 || pos_x + 1 >= 16) ? 
0 : data[width + 1]); + } + } + if (pos_y < height - 1) + { + UPDATE(mts && pos_y + 1 >= 16 ? 0 : data[width]); + if (pos_y < height - 2) + { + UPDATE(mts && pos_y + 2 >= 16 ? 0 : data[width << 1]); + } + } +#undef UPDATE + int ctx_ofs = MIN((sum_abs + 1) >> 1, 3) + (diag < 2 ? 4 : 0); + if (color == COLOR_Y) + { + ctx_ofs += diag < 5 ? 4 : 0; + } + + *temp_diag = diag; + *temp_sum = sum_abs - num_pos; + return ctx_ofs; +} + /** RDOQ with CABAC * \returns void * Rate distortion optimized quantization for entropy @@ -1414,7 +1456,7 @@ void uvg_rdoq( int8_t scan_mode, int8_t block_type, uint16_t cbf, - uint8_t lfnst_idx) + uint8_t lfnst_idx, uint8_t mts_idx) { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; @@ -1516,6 +1558,10 @@ void uvg_rdoq( uint32_t max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1; for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) { + uint32_t cg_blkpos = scan_cg[cg_scanpos]; + uint32_t cg_pos_y = cg_blkpos / num_blk_side; + uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; @@ -1558,6 +1604,7 @@ void uvg_rdoq( uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); FILL(rd_stats, 0); + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) { @@ -1586,7 +1633,7 @@ void uvg_rdoq( uint16_t ctx_sig = 0; if (scanpos != last_scanpos) { // VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here - ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); + ctx_sig = context_get_sig_ctx_idx_abs(dest_coeff, 
pos_x, pos_y, width, height, color, &temp_diag, &temp_sum, mts_idx); } if (temp_diag != -1) { @@ -1595,7 +1642,7 @@ else ctx_set = 0; if (reg_bins < 4) { - int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height, mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } @@ -1647,7 +1694,7 @@ } else if (reg_bins >= 4) { reg_bins -= (level < 2 ? level : 3) + (scanpos != last_scanpos); - int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height, mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } } @@ -1792,11 +1839,23 @@ } // end for uint32_t abs_sum = 0; - for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { - int32_t blkPos = scan[scanpos]; - int32_t level = dest_coeff[blkPos]; - abs_sum += level; - dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + if(!mts_idx || (width < 32 && height < 32)) { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t level = dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + } + } + else { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t blk_x = blkPos & (width - 1); + int32_t blk_y = blkPos >> log2_block_width; + int32_t level = blk_x >= 16 || blk_y >= 16 ? 0 : dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? 
-level : level); + } } //===== clean uncoded coefficients ===== for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) { diff --git a/src/rdo.h b/src/rdo.h index 9aa2d425..2ba0c2a9 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -63,7 +63,7 @@ void uvg_rdoq( int8_t scan_mode, int8_t block_type, uint16_t cbf, - uint8_t lfnst_idx); + uint8_t lfnst_idx, uint8_t mts_idx); int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_coeff, int32_t width, diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 7729d272..cada96f1 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -725,7 +725,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, - scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if (state->encoder_control->cfg.rdoq_enable && use_trskip) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index ceb6b7aa..e39b6c52 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -334,7 +334,7 @@ int uvg_quant_cbcr_residual_generic( (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx, 0); } else if (state->encoder_control->cfg.rdoq_enable && false) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? 
COLOR_V : COLOR_U, @@ -528,7 +528,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, - scan_order, cur_cu->type, cur_cu->cbf, lfnst_index); + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); diff --git a/src/transform.c b/src/transform.c index 77834072..98728da0 100644 --- a/src/transform.c +++ b/src/transform.c @@ -507,7 +507,7 @@ static void quantize_chroma( (transform != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, 0, lfnst_idx); + scan_order, CU_INTRA, 0, lfnst_idx, 0); int j; for (j = 0; j < width * height; ++j) { @@ -521,7 +521,7 @@ static void quantize_chroma( uint16_t temp_cbf = 0; if (*u_has_coeffs)cbf_set(&temp_cbf, COLOR_U); uvg_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, temp_cbf, lfnst_idx); + scan_order, CU_INTRA, temp_cbf, lfnst_idx, 0); } }